### This file extends the pseudo sequences for HLA-II beta (B) alleles

A summary of the final findings needed to reconstruct the old pseudo sequences and to construct the extended ones is in the file

    t4_summary.md

We have pseudo sequences for HLA alleles on 19 positions for HLA-II B alleles contained in file

    ../../data/intermediate_data/pseudosequence_2016_all_X.dat

We want to extend the positions to also cover the additional positions obtained from

    t2_check_additional_pos_contacts.log.ipynb.

First, we need to check whether we can get a unique pseudo sequence for each HLA-II B allele on the additional positions.

If that is true, secondly, we will keep those positions with diversity in amino acids.

In [1]:
import numpy as np
import pandas as pd

from collections import Counter
from collections import defaultdict

Materials:
    
Full sequence files:
    
    ../../data/intermediate_data/HLA_TCR_contact/DRB_prot.alfas

    ../../data/intermediate_data/HLA_TCR_contact/DPB1_prot.alfas

    ../../data/intermediate_data/HLA_TCR_contact/DQB1_prot.alfas

The 19 positions for HLA-II B alleles from NetMHCIIpan-3.0 (these need further adjustment before use; the adjustment must follow the findings summarized in 
t3_summary.txt):

9, 11, 13, 26, 28, 30, 47, 57, 67, 70, 71, 74, 77, 78, 81, 85, 86, 89, 90

9 additional positions (0-indexed) from 

    t2_check_additional_pos_contacts.log.ipynb:

49, 53, 54, 57, 59, 62, 66, 69, 75

pseudo sequences for HLA alleles on 19 positions for HLA-II B alleles contained in file

    ../../data/intermediate_data/pseudosequence_2016_all_X.dat

In [2]:
# tab-separated table without a header row; the two columns are named below
HLA_2_pseudo = pd.read_csv(
    "../../data/intermediate_data/pseudosequence_2016_all_X.dat",
    sep="\t",
    header=None,
)
# (5636, 2)
HLA_2_pseudo.columns = ["HLA", "seq"]
HLA_2_pseudo.head(6)

Unnamed: 0,HLA,seq
0,DRB1_0101,QEFFIASGAAVDAIMWLFLECYDLQRATYHVGFT
1,DRB1_0102,QEFFIASGAAVDAIMWLFLECYDLQRATYHAVFT
2,DRB1_0103,QEFFIASGAAVDAIMWLFLECYDIDEATYHVGFT
3,DRB1_0104,QEFFIASGAAVDAIMWLFLECYDLQRANYHVVFT
4,DRB1_0105,QEFFIASGAAVDAIMWLFLECYDLQRATYHVGFT
5,DRB1_0106,QEFFIASGAAVDAIMWLFLECYDLQAATYHVVFT


In [30]:
# build pseudo sequence dictionary for B allele part
# taking the last 19 positions

# from the exploration in t3_explore_II.ipynb, 
# the only HLA-II pair with two corresponding rows in this table is 'HLA-DPA10103-DPB10601'
# this one is not among the HLA-II pairs that we consider as in HLA v2 table from DeWitt_2018
# so ignore this issue for now and just assign the later pseudo seq to it

# defaultdict(str): looking up a name that is not in the table yields "" instead of raising
HLA_2_pseudo_dict = defaultdict(str)

# rows are consumed in file order, so when a name occurs twice the later row wins;
# seq[15:] drops the first 15 characters of each pseudo sequence
HLA_2_pseudo_dict.update(
    (hla, seq[15:]) for hla, seq in zip(HLA_2_pseudo["HLA"], HLA_2_pseudo["seq"])
)

len(HLA_2_pseudo_dict)

5635

In [32]:
#HLA_2_pseudo_dict

In [6]:
# the original 19 positions from NetMHC-II-pan-3.0 paper
# (kept in the paper's numbering; the reconstruction functions further down
#  subtract 7 from each position before indexing into the full sequences)
nineteen = [9, 11, 13, 26, 28, 30, 47, 57, 67, 70, 71, 74, 77, 78, 81, 85, 86, 89, 90]
nineteen

[9, 11, 13, 26, 28, 30, 47, 57, 67, 70, 71, 74, 77, 78, 81, 85, 86, 89, 90]

In [7]:
# row names of the HLA v2 feature matrix; the HLA-II pair names are taken from here
# (space-separated file without a header row)
HLA_v2_features_row_names = pd.read_csv(
    "../../data/intermediate_data/DeWitt_2018/HLA_v2_features_row_names.txt",
    sep=" ",
    header=None,
)
HLA_v2_features_row_names.columns = ["feature", "hla"]
# (215, 2)
HLA_v2_features_row_names.head(6)

Unnamed: 0,feature,hla
0,feature:,HLA-DPAB*02:01_04:01
1,feature:,HLA-DQAB*05:05_06:04
2,feature:,HLA-B*08:01
3,feature:,HLA-A*24:02
4,feature:,HLA-A*24:03
5,feature:,HLA-B*38:02


In [16]:
# keep the entries whose name starts with one of these prefixes; every prefix is
# exactly 7 characters long, so this selects the same names as testing hla[:7]
_HLA_II_PREFIXES = ("HLA-DPA", "HLA-DQA", "HLA-DRD", "HLA-DRB")
HLA_II_v2_pairs = [
    hla for hla in HLA_v2_features_row_names["hla"] if hla.startswith(_HLA_II_PREFIXES)
]

In [24]:
# entries whose name contains at least two "_" separators
HLA_II_v2_5DRDQ = [item for item in HLA_II_v2_pairs if item.count("_") >= 2]

# everything after the 9-character prefix, split on "_":
# the first field becomes a DRB1 entry, the remaining fields become a DQAB entry
_drdq_fields = [item[9:].split("_") for item in HLA_II_v2_5DRDQ]
HLA_II_v2_5DRDQ_DRB = ["HLA-DRB1*" + fields[0] for fields in _drdq_fields]
HLA_II_v2_5DRDQ_DQAB = ["HLA-DQAB*" + "_".join(fields[1:]) for fields in _drdq_fields]

# replace the combined entries by their two separate parts and drop duplicates
HLA_II_complete = list(
    set(HLA_II_v2_pairs + HLA_II_v2_5DRDQ_DRB + HLA_II_v2_5DRDQ_DQAB) - set(HLA_II_v2_5DRDQ)
)
len(HLA_II_complete)

135

In [27]:
# reconstruct pseudo sequences on the 19 positions of beta chain

# map each HLA-II pair to the corresponding B allele
# load the corresponding pseudo sequences and get the last 19
# reconstruct the pseudo sequences based on position adjustments
#  -- load the three full sequences files
#  -- the lists of DPBs with exception
#  -- write functions to reconstruct pseudo sequences
# check whether for each HLA-II B allele, the pseudo sequences on the 9 additional positions
#  are the same too
# if so, check how many out of these 9 positions have diversity
# if so, move on and write out a file of the B allele names and their corresponding pseudo 
#  sequences on the original 19 positions and the additional positions with diversity

In [33]:
# HLA-II entry -> name of its B allele
HLA_II_beta_dict = defaultdict(str)
# HLA-II entry -> the name used for it in
# "../../data/intermediate_data/pseudosequence_2016_all_X.dat"
trans_hla_II_dict = defaultdict(str)
# B allele -> set of pseudo sequences found for it in the existing pseudo sequence
# dictionary -- expected to be a set of len 1 if no bug
HLA_II_beta_set_pseudo_dict = defaultdict(set)

# split every entry into its two alleles and translate it into the naming scheme
# of "../../data/intermediate_data/pseudosequence_2016_all_X.dat"
for item in HLA_II_complete:
    locus_tag = item[:8]
    if locus_tag in ("HLA-DQAB", "HLA-DPAB"):
        # "DQ" or "DP"; the A and B allele fields follow the 9-character prefix,
        # separated by "_"
        gene = locus_tag[4:6]
        fields = item[9:].split("_")
        item_1 = gene + "A1" + "*" + fields[0]
        item_2 = gene + "B1" + "*" + fields[1]
        translated = ("HLA-" + item_1.replace("*", "").replace(":", "")
                      + "-" + item_2.replace("*", "").replace(":", ""))
    elif locus_tag == "HLA-DRB1":
        item_1 = "DRA"
        item_2 = "DRB1" + "*" + item[9:]
        translated = 'DRB1_' + item[9:].replace(":", "")
    else:
        print("error found, first eight letters exception")
        print(item)
        break

    # bookkeeping shared by all three kinds of entry
    HLA_II_beta_dict[item] = item_2
    trans_hla_II_dict[item] = translated
    HLA_II_beta_set_pseudo_dict[item_2].add(HLA_2_pseudo_dict[trans_hla_II_dict[item]])

In [46]:
# distinct B alleles over all HLA-II entries, in sorted order
# (their pseudo sequences come from
#  file "../../data/intermediate_data/pseudosequence_2016_all_X.dat")
# 62
list_HLA_II_beta = sorted(set(HLA_II_beta_dict.values()))
list_HLA_II_beta[:10]

['DPB1*01:01',
 'DPB1*02:01',
 'DPB1*03:01',
 'DPB1*04:01',
 'DPB1*04:02',
 'DPB1*05:01',
 'DPB1*10:01',
 'DPB1*11:01',
 'DPB1*13:01',
 'DPB1*17:01']

In [49]:
# use HLA_II_beta_set_pseudo_dict (B allele -> set of pseudo sequences) to get a
# dict with a single pseudo seq as value
HLA_II_beta_pseudo_dict = defaultdict(str)

for key, pseudo_set in HLA_II_beta_set_pseudo_dict.items():
    # Taking "the first" element of a set is arbitrary when the set holds more
    # than one sequence, so enforce the expected one-sequence-per-allele
    # invariant instead of silently picking one.
    if len(pseudo_set) != 1:
        raise ValueError(
            "expected exactly one pseudo sequence for %s, found %d: %s"
            % (key, len(pseudo_set), sorted(pseudo_set))
        )
    HLA_II_beta_pseudo_dict[key] = next(iter(pseudo_set))

len(HLA_II_beta_pseudo_dict)

62

Load the full sequences files

Load DRB sequences

In [53]:
# read the file line by line into a frame; only the first column is used below
DRB_prot = pd.read_csv("../../data/intermediate_data/HLA_TCR_contact/DRB_prot.alfas",
                       sep=" ", header=None)

# even rows are taken as names (with ">" removed), odd rows as the matching
# sequences; a trailing unpaired row is ignored
name_ind = list(range(DRB_prot.shape[0] // 2))
_drb_paired_rows = 2 * len(name_ind)
names = [name.replace(">", "") for name in DRB_prot.iloc[0:_drb_paired_rows:2, 0].tolist()]
seqs = DRB_prot.iloc[1:_drb_paired_rows:2, 0].tolist()

DRB_seqs = pd.DataFrame({'name': names, 'seq': seqs})
# (2238, 2)

# short name = the first two ":"-separated fields of the full name
DRB_seqs['short'] = [":".join(name.split(":", 2)[:2]) for name in DRB_seqs['name']]

# short name -> set of all distinct sequences recorded under it
DRB_seq_dict = defaultdict(set)
for short, seq in zip(DRB_seqs['short'], DRB_seqs['seq']):
    DRB_seq_dict[short].add(seq)

len(DRB_seq_dict)

1709

Load DPB1 sequences

In [81]:
# read the file line by line into a frame; only the first column is used below
DPB1_prot = pd.read_csv("../../data/intermediate_data/HLA_TCR_contact/DPB1_prot.alfas",
                        sep=" ", header=None)

# even rows are taken as names (with ">" removed), odd rows as the matching
# sequences; a trailing unpaired row is ignored
name_ind = list(range(DPB1_prot.shape[0] // 2))
_dpb1_paired_rows = 2 * len(name_ind)
names = [name.replace(">", "") for name in DPB1_prot.iloc[0:_dpb1_paired_rows:2, 0].tolist()]
seqs = DPB1_prot.iloc[1:_dpb1_paired_rows:2, 0].tolist()

DPB1_seqs = pd.DataFrame({'name': names, 'seq': seqs})
# (873, 2)

# short name = the first two ":"-separated fields of the full name
DPB1_seqs['short'] = [":".join(name.split(":", 2)[:2]) for name in DPB1_seqs['name']]

# short name -> set of all distinct sequences recorded under it
DPB1_seq_dict = defaultdict(set)
for short, seq in zip(DPB1_seqs['short'], DPB1_seqs['seq']):
    DPB1_seq_dict[short].add(seq)

len(DPB1_seq_dict)

641

Load DQB1 sequences

In [82]:
# read the file line by line into a frame; only the first column is used below
DQB1_prot = pd.read_csv("../../data/intermediate_data/HLA_TCR_contact/DQB1_prot.alfas",
                        sep=" ", header=None)

# even rows are taken as names (with ">" removed), odd rows as the matching
# sequences; a trailing unpaired row is ignored
name_ind = list(range(DQB1_prot.shape[0] // 2))
_dqb1_paired_rows = 2 * len(name_ind)
names = [name.replace(">", "") for name in DQB1_prot.iloc[0:_dqb1_paired_rows:2, 0].tolist()]
seqs = DQB1_prot.iloc[1:_dqb1_paired_rows:2, 0].tolist()

DQB1_seqs = pd.DataFrame({'name': names, 'seq': seqs})
# (1045, 2)

# short name = the first two ":"-separated fields of the full name
DQB1_seqs['short'] = [":".join(name.split(":", 2)[:2]) for name in DQB1_seqs['name']]

# short name -> set of all distinct sequences recorded under it
DQB1_seq_dict = defaultdict(set)
for short, seq in zip(DQB1_seqs['short'], DQB1_seqs['seq']):
    DQB1_seq_dict[short].add(seq)

len(DQB1_seq_dict)

748

In [60]:
# set of DPBs that need additional modifications:
# for these alleles the reconstruction functions below are called with
# modify_flag=True, which overrides the first residue of the pseudo sequence
extra_modify_DPBs = ['DPB1*17:01', 'DPB1*10:01']

Next we proceed to verify the reconstructed pseudo sequences based on the 19 positions. 

The adjustments needed for alignment:

(0) All positions from NetMHCIIpan-3.0 should be subtracted by 7 in order to match the full sequence files, 

which means from 

9, 11, 13, 26, 28, 30, 47, 57, 67, 70, 71, 74, 77, 78, 81, 85, 86, 89, 90

to

2, 4, 6, 19, 21, 23, 40, 50, 60, 63, 64, 67, 70, 71, 74, 78, 79, 82, 83.

(1) For two DPBs DPB1*17:01 and DPB1*10:01, the first position needs to be moved by 2 positions to the left, 

which means from 

2, 4, 6, 19, 21, 23, 40, 50, 60, 63, 64, 67, 70, 71, 74, 78, 79, 82, 83

to 

0, 4, 6, 19, 21, 23, 40, 50, 60, 63, 64, 67, 70, 71, 74, 78, 79, 82, 83.

(2) If a B allele has multiple corresponding pseudo sequences, drop those containing "X".

In [64]:
# reconstruct a pseudo sequence dictionary from full sequence data



# filled by the loop below: B allele -> pseudo sequence rebuilt from the full sequence
HLA_II_beta_pseudo_rec_dict = defaultdict(str)

# independent copy of the 19 positions
pos_beta = list(nineteen)

def get_b_half_modify_19(lookup_dict, allele, pos_beta, modify_flag):
    """Rebuild the pseudo sequence of one B allele from its full sequence(s).

    Parameters
    ----------
    lookup_dict : mapping of allele name -> collection of full sequences
    allele : key looked up in ``lookup_dict``
    pos_beta : positions to extract; 7 is subtracted from each position before
        it is used as an index into a full sequence
    modify_flag : when true, the first residue of every candidate pseudo
        sequence is replaced by 'Y'

    Returns
    -------
    ``(True, pseudo_seq)`` when exactly one distinct candidate without an "X"
    remains, otherwise ``(False, "")``.
    """
    candidates = set()
    for full_seq in lookup_dict[allele]:
        pseudo = "".join([full_seq[ind - 7] for ind in pos_beta])
        if modify_flag:
            pseudo = 'Y' + pseudo[1:]
        # De-duplicate AFTER the first-residue override: candidates that differ
        # only in the overridden residue are the same pseudo sequence and must
        # not be counted as ambiguous.
        candidates.add(pseudo)
    seq_pseudo_noX = [seq for seq in candidates if "X" not in seq]
    if len(seq_pseudo_noX) == 1:
        return True, seq_pseudo_noX[0]
    return False, ""
    
    
# the first three letters of the allele name select the full-sequence table;
# only DPB alleles can ask for the first-residue modification
for item in list_HLA_II_beta:
    locus = item[:3]
    if locus == "DPB":
        modify_flag = (item in extra_modify_DPBs)
        sub_flag, seq = get_b_half_modify_19(DPB1_seq_dict, item, pos_beta, modify_flag)
    elif locus in ("DQB", "DRB"):
        locus_table = DQB1_seq_dict if locus == "DQB" else DRB_seq_dict
        sub_flag, seq = get_b_half_modify_19(locus_table, item, pos_beta, False)
    else:
        print("error found, first three letters exception")
        print(item)
        break

    # stop at the first allele without exactly one usable pseudo sequence
    if not sub_flag:
        print("seq_pseudo_noX len is not 1")
        print("item = ", item)
        break
    HLA_II_beta_pseudo_rec_dict[item] = seq

In [67]:
# thus we have verified that this way of reconstructing pseudo sequences
# gets results that match those from file "../../data/intermediate_data/pseudosequence_2016_all_X.dat"
HLA_II_beta_pseudo_rec_dict == HLA_II_beta_pseudo_dict

True

In [69]:
DPB1_seq_dict['DPB1*01:01']

{'YVYQGRQECYAFN--GTQRFLERYIYNREEYARFDSDVGEFRAVTELGRPAAEYWNSQKDILEEKRAVPDRVCRHNYELDEAVTL'}

In [70]:
DPB1_seq_dict['DPB1*10:01']

{'YVHQLRQECYAFN--GTQRFLERYIYNREEFVRFDSDVGEFRAVTELGRPDEEYWNSQKDILEEERAVPDRVCRHNYELDEAVTL'}

In [71]:
DPB1_seq_dict['DPB1*17:01']

{'YVHQLRQECYAFN--GTQRFLERYIYNREEFVRFDSDVGEFRAVTELGRPDEDYWNSQKDILEEERAVPDRMCRHNYELDEAVTL'}

In [73]:
[ind - 7 for ind in nineteen]

[2, 4, 6, 19, 21, 23, 40, 50, 60, 63, 64, 67, 70, 71, 74, 78, 79, 82, 83]

In [74]:
HLA_II_beta_pseudo_dict['DPB1*01:01']
'YVYQGRQECYAFN--GTQRFLERYIYNREEYARFDSDVGEFRAVTELGRPAAEYWNSQKDILEEKRAVPDRVCRHNYELDEAVTL'

'YGQFEYFAIEKVRVHLDVT'

In [75]:
HLA_II_beta_pseudo_dict['DPB1*10:01']
'YVHQLRQECYAFN--GTQRFLERYIYNREEFVRFDSDVGEFRAVTELGRPDEEYWNSQKDILEEERAVPDRVCRHNYELDEAVTL'

'YLQFEYFDIEEVRVHLDVT'

In [76]:
HLA_II_beta_pseudo_dict['DPB1*17:01']
'YVHQLRQECYAFN--GTQRFLERYIYNREEFVRFDSDVGEFRAVTELGRPDEDYWNSQKDILEEERAVPDRMCRHNYELDEAVTL'

'YLQFEYFDIEEVRMHLDVT'

In [83]:
#list_HLA_II_beta[:10]

In [84]:
#[DPB1_seq_dict[key] for key in list_HLA_II_beta]

In [86]:
# 9 additional positions (index adjusted); in the cells below they are looked up
# in the HLA-II beta-chain (DQB1 / DPB1 / DRB) full sequences:
nine = [49, 53, 54, 57, 59, 62, 66, 69, 75]
# add 7 to all to take them on the same scale as the original nineteen positions
up_nine = [ind + 7 for ind in nine]
up_nine

[56, 60, 61, 64, 66, 69, 73, 76, 82]

Assume that the two extra_modify_DPBs will not need additional adjustment after the very beginning part of the sequences.

In [87]:
# now let's check whether for each HLA-II B alleles, we can find a unique pseudo 
# sequence for it based on these 9 positions


# one success flag per processed allele, and B allele -> 9-position pseudo sequence
HLA_II_beta_pseudo_9_dict = defaultdict(str)
flag_HLA_II_on_9 = []

# independent copy of the 9 shifted positions
pos_up_9 = list(up_nine)

def get_b_half_modify_9(lookup_dict, allele, pos_up_9):
    """Extract the pseudo sequence of one allele on the given positions.

    Every full sequence stored under ``lookup_dict[allele]`` is reduced to the
    residues at ``pos_up_9`` (7 is subtracted from each position before
    indexing). Candidates containing "X" are discarded.

    Returns ``(True, pseudo_seq)`` when exactly one distinct candidate is left,
    otherwise ``(False, "")``.
    """
    distinct = {
        "".join(full_seq[ind - 7] for ind in pos_up_9)
        for full_seq in list(lookup_dict[allele])
    }
    usable = [cand for cand in distinct if "X" not in cand]
    if len(usable) != 1:
        return False, ""
    return True, usable[0]
    
    
# first three letters of the allele name -> full-sequence table to search
_locus_tables_9 = {"DQB": DQB1_seq_dict, "DPB": DPB1_seq_dict, "DRB": DRB_seq_dict}

for item in list_HLA_II_beta:
    if item[:3] not in _locus_tables_9:
        print("error found, first three letters exception")
        print(item)
        break

    sub_flag, seq = get_b_half_modify_9(_locus_tables_9[item[:3]], item, pos_up_9)
    # the flag is recorded before the failure check, so a failing allele is counted too
    flag_HLA_II_on_9.append(sub_flag)
    if not sub_flag:
        print("seq_pseudo_noX len is not 1")
        print("item = ", item)
        break
    HLA_II_beta_pseudo_9_dict[item] = seq

In [90]:
sum(flag_HLA_II_on_9)/len(flag_HLA_II_on_9)

1.0

In [95]:
# now we have verified that we can construct one unique pseudo sequence
# on the nine additional positions for each HLA-II B allele in 
# the HLA-II pairs in HLA v2 matrix of DeWitt_2018

# next we check how many of these 9 additional positions have diversity
# in terms of amino acids across the 62 HLA-II B alleles

# for each of the 9 positions: number of distinct residues seen across all alleles
nunique_9 = [
    len({value[i] for value in HLA_II_beta_pseudo_9_dict.values()})
    for i in range(9)
]

nunique_9

[2, 3, 1, 1, 2, 1, 2, 1, 1]

In [96]:
# 4 out of 9 positions have diversity: keep those where more than one residue occurs
add_HLA_II_4 = [pos for pos, n_res in zip(up_nine, nunique_9) if n_res > 1]
add_HLA_II_4

[56, 60, 66, 73]

Combine the 4 additional positions with the original 19 to make a set of 23 positions.

In [101]:
# the original 19 positions plus the 4 additional ones, in ascending order
extended_HLA_II_beta = sorted(nineteen + add_HLA_II_4)
#extended_HLA_II_beta
# [9,11,13,26,28,30,47,56,57,60,66,67,70,71,73,74,77,78,81,85,86,89,90]

In [104]:
# now move on to get pseudo sequences based on these 23 positions:

# three adjustments mentioned in earlier part of this file is also used here

# one success flag per processed allele, and B allele -> 23-position pseudo sequence
HLA_II_beta_pseudo_23_dict = defaultdict(str)
flag_HLA_II_on_23 = []

# independent copy of the 23 extended positions
pos_up_23 = list(extended_HLA_II_beta)

def get_b_half_modify_23(lookup_dict, allele, pos_up_23, modify_flag):
    """Build the extended pseudo sequence of one B allele from its full sequence(s).

    Parameters
    ----------
    lookup_dict : mapping of allele name -> collection of full sequences
    allele : key looked up in ``lookup_dict``
    pos_up_23 : positions to extract; 7 is subtracted from each position before
        it is used as an index into a full sequence
    modify_flag : when true, the first residue of every candidate pseudo
        sequence is replaced by 'Y'

    Returns
    -------
    ``(True, pseudo_seq)`` when exactly one distinct candidate without an "X"
    remains, otherwise ``(False, "")``.
    """
    candidates = set()
    for full_seq in lookup_dict[allele]:
        pseudo = "".join([full_seq[ind - 7] for ind in pos_up_23])
        if modify_flag:
            pseudo = 'Y' + pseudo[1:]
        # De-duplicate AFTER the first-residue override: candidates that differ
        # only in the overridden residue are the same pseudo sequence and must
        # not be counted as ambiguous.
        candidates.add(pseudo)
    seq_pseudo_noX = [seq for seq in candidates if "X" not in seq]
    if len(seq_pseudo_noX) == 1:
        return True, seq_pseudo_noX[0]
    return False, ""
    
# first three letters of the allele name -> full-sequence table to search
_locus_tables_23 = {"DQB": DQB1_seq_dict, "DPB": DPB1_seq_dict, "DRB": DRB_seq_dict}

for item in list_HLA_II_beta:
    locus = item[:3]
    if locus not in _locus_tables_23:
        print("error found, first three letters exception")
        print(item)
        break

    if locus == "DPB":
        # only DPB alleles can ask for the first-residue modification
        modify_flag = (item in extra_modify_DPBs)
        sub_flag, seq = get_b_half_modify_23(_locus_tables_23[locus], item, pos_up_23, modify_flag)
    else:
        sub_flag, seq = get_b_half_modify_23(_locus_tables_23[locus], item, pos_up_23, False)

    # the flag is recorded before the failure check, so a failing allele is counted too
    flag_HLA_II_on_23.append(sub_flag)
    if not sub_flag:
        print("seq_pseudo_noX len is not 1")
        print("item = ", item)
        break
    HLA_II_beta_pseudo_23_dict[item] = seq

In [105]:
sum(flag_HLA_II_on_23)/len(flag_HLA_II_on_23)

1.0

In [107]:
len(HLA_II_beta_pseudo_23_dict)

62

In [108]:
# pseudo sequences listed in the same (sorted) order as list_HLA_II_beta
HLA_II_beta_pseudo_23_value = [HLA_II_beta_pseudo_23_dict[key] for key in list_HLA_II_beta]

# one row per B allele: its name and its 23-position pseudo sequence
df_HLA_II_beta_23 = pd.DataFrame(
    {
        "allele": list_HLA_II_beta,
        "seq": HLA_II_beta_pseudo_23_value,
    }
)
df_HLA_II_beta_23.shape
# (62, 2)

In [111]:
# write the dictionary out in the format of a table
# (columns "allele" and "seq"; index=False keeps the DataFrame index out of the file)
df_HLA_II_beta_23.to_csv("../../data/intermediate_data/t4_HLA_II_v2_beta_pseudo_23_dict.csv", index = False)

In [112]:
# go and check a few HLA-II B alleles

In [116]:
#'DPB1*01:01' 'YGQFEYFAIEKVRVHLDVT'
# keep only the residues whose position is one of the original 19; the result
# must be the 19-position pseudo sequence
''.join(aa for aa, pos in zip('YGQFEYFPAYDIEKAVRVHLDVT', extended_HLA_II_beta) if pos in nineteen)\
   == 'YGQFEYFAIEKVRVHLDVT'

True

In [121]:
# 'DPB1*10:01': 'YLQFEYFDIEEVRVHLDVT'
# keep only the residues whose position is one of the original 19; the result
# must be the 19-position pseudo sequence
''.join(aa for aa, pos in zip('YLQFEYFPDYDIEEAVRVHLDVT', extended_HLA_II_beta) if pos in nineteen)\
   == 'YLQFEYFDIEEVRVHLDVT'

True

In [123]:
# 'DPB1*17:01': 'YLQFEYFDIEEVRMHLDVT'
# keep only the residues whose position is one of the original 19; the result
# must be the 19-position pseudo sequence
''.join(aa for aa, pos in zip('YLQFEYFPDYDIEEAVRMHLDVT', extended_HLA_II_beta) if pos in nineteen)\
   == 'YLQFEYFDIEEVRMHLDVT'

True

In [125]:
# 'DQB1*06:02': 'FFGLTYYDVGTETVHVAGI',
# keep only the residues whose position is one of the original 19; the result
# must be the 19-position pseudo sequence
''.join(aa for aa, pos in zip('FFGLTYYPDYEVGTAETVHVAGI', extended_HLA_II_beta) if pos in nineteen)\
   == 'FFGLTYYDVGTETVHVAGI'

True

In [127]:
# 'DRB1*12:02': 'ESGLEHFVFDRATYHAVFT',
# keep only the residues whose position is one of the original 19; the result
# must be the 19-position pseudo sequence
''.join(aa for aa, pos in zip('ESGLEHFPVSDFDRAATYHAVFT', extended_HLA_II_beta) if pos in nineteen)\
   == 'ESGLEHFVFDRATYHAVFT'

True