Extract the short pseudo sequence (34 amino acids) from MHC_pseudo.txt for the 85 HLA-I alleles in DeWitt 2018.

In [1]:
import os

import numpy as np
import pandas as pd

from collections import Counter

In [2]:
# The pseudo-sequence table is read as space-delimited text with no header row.
data_dir = "../../data/intermediate_data"
MHC_file = "MHC_pseudo.txt"
MHC_path = os.path.join(data_dir, MHC_file)
MHC_pseudo = pd.read_csv(
    MHC_path,
    header=None,
    delimiter=' ',
)

In [3]:
MHC_pseudo.shape

(5327, 2)

In [4]:
# Label the two columns that were read without a header (pandas numbers them 0 and 1).
# `rename` is used instead of assigning to `.columns` so that the cell can be re-run
# after a later cell has added columns to MHC_pseudo: labels that are already
# renamed are simply ignored, whereas a fixed-length assignment would raise.
MHC_pseudo = MHC_pseudo.rename(columns={0: "name", 1: "seq"})
MHC_pseudo.head(9)

Unnamed: 0,name,seq
0,BoLA-1:00901,YYSMYREISENVYGSNLYLLYRDYTWEYLNYRWY
1,BoLA-1:00902,YYSEYREISENVYESNLYLLYRDYTWEYLNYRWY
2,BoLA-1:01901,YHTKYREISENVYGSNLYYDYDYYTWAVFNYRGY
3,BoLA-1:02001,YHTKYREISENVYGSNLYFLYMDYTWAVFNYRGY
4,BoLA-1:02101,YYTKYREISENVYGSNLYFQFRYYTWADFNYEGY
5,BoLA-1:02301,YYSEYREISENVYESNLYIAYSDYTWEYLNYRWY
6,BoLA-1:02801,YYTKYREISEKLYENTLYLQFRYYTWADFNYEWY
7,BoLA-1:02901,YYTRYREISENLYKNTAYITFMYYTWANENYRGY
8,BoLA-1:03101,YYTKYDEISENLYKNTLYIAFRDYTWAYLNYTWY


In [5]:
# Distinct values per column; fewer distinct names than rows means some names repeat.
MHC_pseudo.nunique()

name    5315
seq     2125
dtype: int64

In [6]:
# Keep every row whose name occurs more than once (all copies, not only the extras).
is_repeated_name = MHC_pseudo['name'].duplicated(keep=False)
dup_rows = MHC_pseudo[is_repeated_name]
dup_rows.shape

(24, 2)

In [7]:
# None of the rows with a duplicated name belongs to an HLA allele, so the
# duplicates are ignored for now. The assertion makes that assumption explicit:
# if the input table changes, the notebook stops here instead of silently
# carrying duplicated HLA names forward.
assert not dup_rows["name"].str.startswith("HLA").any(), "duplicated HLA allele names found"
dup_rows

Unnamed: 0,name,seq
4598,Mamu-A01,YYAMYRENMTENAVNTLYLRVEYYTWAVMAYQWY
4599,Mamu-A01,YYAMYRENMTENAVNTLYLRVEYYTWAVMAYQWY
4600,Mamu-A02,YYAMYRENMAENAVNNLYIRYHSYTWAEHTYEWY
4601,Mamu-A02,YYAMYRENMAENAVNNLYIRYHSYTWAEHTYEWY
4612,Mamu-A07,YYSEYRNICANTYESNLYIRYEFYTWAAMAYEWH
4613,Mamu-A07,YYSEYRNICANTYESNLYIRYEFYTWAAMAYEWH
4615,Mamu-A11,YHTKYREISANTYENTAYFTYDYYTWAVHTYEWY
4616,Mamu-A11,YHTKYREISANTYENTAYFTYDYYTWAVHTYEWY
4855,Mamu-B01,YHSMYREKAGNTDENIAYLMHYRYTWAVRAYRWY
4856,Mamu-B01,YHSMYREKAGNTDENIAYLMHYRYTWAVRAYRWY


In [8]:
dup_rows.name.nunique()

12

In [9]:
# Sanity check: rows with a repeated name plus the names that occur only once
# should add up to the total number of rows in MHC_pseudo.
n_names_seen_once = MHC_pseudo["name"].nunique() - dup_rows["name"].nunique()
len(dup_rows) + n_names_seen_once

5327

In [None]:
# subset the HLA related rows

In [10]:
# Look at the first three characters of the first allele name.
MHC_pseudo["name"][0][:3]

'BoL'

In [11]:
# Store the first three characters of every name (e.g. 'BoL', 'HLA') as a grouping key.
MHC_pseudo['first_three'] = MHC_pseudo['name'].str.slice(0, 3)
MHC_pseudo.head(6)

Unnamed: 0,name,seq,first_three
0,BoLA-1:00901,YYSMYREISENVYGSNLYLLYRDYTWEYLNYRWY,BoL
1,BoLA-1:00902,YYSEYREISENVYESNLYLLYRDYTWEYLNYRWY,BoL
2,BoLA-1:01901,YHTKYREISENVYGSNLYYDYDYYTWAVFNYRGY,BoL
3,BoLA-1:02001,YHTKYREISENVYGSNLYFLYMDYTWAVFNYRGY,BoL
4,BoLA-1:02101,YYTKYREISENVYGSNLYFQFRYYTWADFNYEGY,BoL
5,BoLA-1:02301,YYSEYREISENVYESNLYIAYSDYTWEYLNYRWY,BoL


In [12]:
# Number of rows for each three-character name prefix.
Counter(MHC_pseudo['first_three'])

Counter({'BoL': 181,
         'Chi': 3,
         'Gog': 1,
         'H-2': 9,
         'H2-': 8,
         'HLA': 4396,
         'Mam': 508,
         'Pat': 105,
         'SLA': 116})

In [13]:
# Keep only the rows whose name prefix is exactly 'HLA'.
is_hla = MHC_pseudo['first_three'] == 'HLA'
HLA_pseudo = MHC_pseudo.loc[is_hla]
HLA_pseudo.shape

(4396, 3)

In [14]:
HLA_pseudo[0:9]

Unnamed: 0,name,seq,first_three
202,HLA-A0101,YFAMYQENMAHTDANTLYIIYRDYTWVARVYRGY,HLA
203,HLA-A0102,YSAMYQENMAHTDANTLYIIYRDYTWVARVYRGY,HLA
204,HLA-A0103,YFAMYQENMAHTDANTLYIMYRDYTWVARVYRGY,HLA
205,HLA-A0104,YFAMYQENMAHTDANTLYIIYRDYTWVARVYRGY,HLA
206,HLA-A0106,YFAMYQENMAHTDANTLYIIYRDYTWVALAYRGY,HLA
207,HLA-A0107,YFAMYQENVAHTDENTLYIIYRDYTWVARVYRGY,HLA
208,HLA-A0108,YFAMYQENMAHTDANTLYIIYRDYTWVARVYWGY,HLA
209,HLA-A0109,YFAMYQENMAHTDANTLYIIYRDYTWVARVYRGY,HLA
210,HLA-A0110,YFAMYQENMAHTDANTLYIIYRDYTWARRVYRGY,HLA


In [15]:
HLA_pseudo.nunique()

name           4396
seq            1577
first_three       1
dtype: int64

In [16]:
HLA_pseudo.name[:10]

202    HLA-A0101
203    HLA-A0102
204    HLA-A0103
205    HLA-A0104
206    HLA-A0106
207    HLA-A0107
208    HLA-A0108
209    HLA-A0109
210    HLA-A0110
211    HLA-A0111
Name: name, dtype: object

Load HLA information v2 from DeWitt 2018

In [20]:
HLA_2_file = "../../data/intermediate_data/DeWitt_2018/HLA_v2_features.txt"

In [21]:
# Keep only the first two space-separated fields of every line of HLA_2_file
# and write them to a sibling file with the suffix "_row_names".
# Both paths are derived from HLA_2_file so the location is defined in one place.
HLA_2_root, HLA_2_ext = os.path.splitext(HLA_2_file)
HLA_2_row_names_file = HLA_2_root + "_row_names" + HLA_2_ext

cmd2 = "cut -f1,2 -d ' ' {} > {}".format(HLA_2_file, HLA_2_row_names_file)
cut_status = os.system(cmd2)

# os.system reports failure only through its return value; stop here rather than
# let a later cell read a missing, empty or stale row-names file.
assert cut_status == 0, "cut failed with exit status {}".format(cut_status)
cut_status

0

In [22]:
# Read the two-column, space-delimited row-names file (no header row).
HLA_v2_rname_path = "../../data/intermediate_data/DeWitt_2018/HLA_v2_features_row_names.txt"
HLA_v2_rname = pd.read_csv(HLA_v2_rname_path, header=None, sep=" ")
HLA_v2_rname.shape

(215, 2)

In [23]:
# First field is the constant tag 'feature:', second field is the allele name.
HLA_v2_rname.columns = ["feature", "name"]
HLA_v2_rname.head(10)

Unnamed: 0,feature,name
0,feature:,HLA-DPAB*02:01_04:01
1,feature:,HLA-DQAB*05:05_06:04
2,feature:,HLA-B*08:01
3,feature:,HLA-A*24:02
4,feature:,HLA-A*24:03
5,feature:,HLA-B*38:02
6,feature:,HLA-DPAB*02:01_04:02
7,feature:,HLA-DRDQ*10:01_01:05_05:01
8,feature:,HLA-B*51:01
9,feature:,HLA-B*57:01


In [24]:
HLA_v2_rname.nunique()

feature      1
name       215
dtype: int64

In [25]:
# Distinct allele names of the v2 feature set.
HLA_v2_set = set(HLA_v2_rname["name"])
len(HLA_v2_set)

215

now look at how many different general allele types there are in HLA_v2_set and the counts of each one

In [27]:
# Sort the names, then take the part before '*' as the general allele type.
HLA_v2_list = sorted(HLA_v2_set)
general_type_2 = [allele.partition("*")[0] for allele in HLA_v2_list]
Counter(general_type_2)

Counter({'HLA-A': 24,
         'HLA-B': 40,
         'HLA-C': 21,
         'HLA-DPAB': 25,
         'HLA-DQAB': 67,
         'HLA-DRB1': 33,
         'HLA-DRDQ': 5})

move on to check the intersection of HLA_v2 with the HLA pseudo sequence and how to deal with combined types

In [28]:
# Drop every '*' so the names follow the 'HLA-A01:01' style used in the pseudo-sequence table.
HLA_rm_star_v2_list = ["".join(allele.split("*")) for allele in HLA_v2_list]
HLA_rm_star_v2_list

['HLA-A01:01',
 'HLA-A02:01',
 'HLA-A02:05',
 'HLA-A02:06',
 'HLA-A03:01',
 'HLA-A03:02',
 'HLA-A11:01',
 'HLA-A23:01',
 'HLA-A24:02',
 'HLA-A24:03',
 'HLA-A25:01',
 'HLA-A26:01',
 'HLA-A29:01',
 'HLA-A29:02',
 'HLA-A30:01',
 'HLA-A30:02',
 'HLA-A31:01',
 'HLA-A32:01',
 'HLA-A33:01',
 'HLA-A33:03',
 'HLA-A34:01',
 'HLA-A66:01',
 'HLA-A68:01',
 'HLA-A68:02',
 'HLA-B07:02',
 'HLA-B07:05',
 'HLA-B08:01',
 'HLA-B13:02',
 'HLA-B14:01',
 'HLA-B14:02',
 'HLA-B15:01',
 'HLA-B15:03',
 'HLA-B15:07',
 'HLA-B15:17',
 'HLA-B15:18',
 'HLA-B18:01',
 'HLA-B27:05',
 'HLA-B35:01',
 'HLA-B35:02',
 'HLA-B35:03',
 'HLA-B35:08',
 'HLA-B37:01',
 'HLA-B38:01',
 'HLA-B38:02',
 'HLA-B39:01',
 'HLA-B39:06',
 'HLA-B40:01',
 'HLA-B40:02',
 'HLA-B40:06',
 'HLA-B41:01',
 'HLA-B41:02',
 'HLA-B44:02',
 'HLA-B44:03',
 'HLA-B45:01',
 'HLA-B48:01',
 'HLA-B49:01',
 'HLA-B50:01',
 'HLA-B51:01',
 'HLA-B52:01',
 'HLA-B53:01',
 'HLA-B55:01',
 'HLA-B56:01',
 'HLA-B57:01',
 'HLA-B58:01',
 'HLA-C01:02',
 'HLA-C02:02',
 'HLA-C03:

In [29]:
# Number of v2 allele names (star removed) that also appear as a name in HLA_pseudo.
len(set(HLA_rm_star_v2_list).intersection(set(HLA_pseudo.name)))

85

In [30]:
# v2 allele names (star removed) that have no matching name in HLA_pseudo.
set(HLA_rm_star_v2_list) - set(HLA_pseudo.name)

{'HLA-DPAB01:03_01:01',
 'HLA-DPAB01:03_02:01',
 'HLA-DPAB01:03_03:01',
 'HLA-DPAB01:03_04:01',
 'HLA-DPAB01:03_04:02',
 'HLA-DPAB01:03_05:01',
 'HLA-DPAB01:03_10:01',
 'HLA-DPAB01:03_11:01',
 'HLA-DPAB01:03_13:01',
 'HLA-DPAB01:03_17:01',
 'HLA-DPAB02:01_01:01',
 'HLA-DPAB02:01_02:01',
 'HLA-DPAB02:01_03:01',
 'HLA-DPAB02:01_04:01',
 'HLA-DPAB02:01_04:02',
 'HLA-DPAB02:01_05:01',
 'HLA-DPAB02:01_10:01',
 'HLA-DPAB02:01_11:01',
 'HLA-DPAB02:01_13:01',
 'HLA-DPAB02:01_17:01',
 'HLA-DPAB02:02_01:01',
 'HLA-DPAB02:02_02:01',
 'HLA-DPAB02:02_04:01',
 'HLA-DPAB02:02_04:02',
 'HLA-DPAB02:02_05:01',
 'HLA-DQAB01:01_02:02',
 'HLA-DQAB01:01_03:01',
 'HLA-DQAB01:01_03:02',
 'HLA-DQAB01:01_05:01',
 'HLA-DQAB01:01_05:03',
 'HLA-DQAB01:01_06:02',
 'HLA-DQAB01:01_06:03',
 'HLA-DQAB01:02_02:01',
 'HLA-DQAB01:02_02:02',
 'HLA-DQAB01:02_03:01',
 'HLA-DQAB01:02_03:02',
 'HLA-DQAB01:02_03:03',
 'HLA-DQAB01:02_04:02',
 'HLA-DQAB01:02_05:01',
 'HLA-DQAB01:02_05:02',
 'HLA-DQAB01:02_06:03',
 'HLA-DQAB01:02_

### constrained to HLA-I pseudo sequence for now

In [None]:
# double check whether HLA_pseudo only contains HLA-I alleles

In [31]:
# Reduce every HLA name to its general type by removing the colons and all digits,
# e.g. 'HLA-A0101' and 'HLA-A01:01' both become 'HLA-A'.
HLA_pseudo_list = list(HLA_pseudo["name"])
HLA_pseudo_list_rm_colon = [allele.replace(":", "") for allele in HLA_pseudo_list]

HLA_pseudo_general_type = []
for allele in HLA_pseudo_list_rm_colon:
    non_digit_chars = [ch for ch in allele if not ch.isdigit()]
    HLA_pseudo_general_type.append("".join(non_digit_chars))

len(HLA_pseudo_general_type)

4396

In [32]:
# Tally of the general types (name with colons and digits removed) in HLA_pseudo.
Counter(HLA_pseudo_general_type)

Counter({'HLA-A': 1364, 'HLA-B': 2204, 'HLA-C': 808, 'HLA-E': 4, 'HLA-G': 16})

In [237]:
# we have verified that HLA_pseudo only contains HLA-I alleles

Subset HLA_v2 to the rows that cover HLA-I alleles only

In [35]:
# General type of each v2 row (text before '*'), and a row mask that keeps
# only the HLA-A, HLA-B and HLA-C rows.
HLA_I_general_types = ["HLA-A", "HLA-B", "HLA-C"]
HLA_v2_name_general_type = [allele.split("*")[0] for allele in HLA_v2_rname["name"].tolist()]

HLA_v2_r2kp = [general_type in HLA_I_general_types for general_type in HLA_v2_name_general_type]

In [36]:
# Apply the boolean row mask to keep the HLA-A/B/C rows.
HLA_I_v2 = HLA_v2_rname.loc[HLA_v2_r2kp]

HLA_I_v2.shape

(85, 2)

In [37]:
Counter([item.split("*")[0] for item in HLA_I_v2.name.tolist()])

Counter({'HLA-B': 40, 'HLA-A': 24, 'HLA-C': 21})

In [39]:
# find the corresponding pseudo sequences
HLA_pseudo.shape

(4396, 3)

In [40]:
HLA_pseudo.nunique()

name           4396
seq            1577
first_three       1
dtype: int64

In [41]:
# Same star removal as before, restricted to the HLA-I rows and kept in row order.
HLA_I_v2_name_rm_star = [allele.replace("*", "") for allele in HLA_I_v2["name"]]

In [42]:
# Verify that every HLA-I allele in v2 has an entry in HLA_pseudo.
# The assertion enforces the check; the displayed count documents it.
HLA_I_v2_missing = set(HLA_I_v2_name_rm_star) - set(HLA_pseudo.name)
assert not HLA_I_v2_missing, "HLA-I alleles without a pseudo sequence: {}".format(sorted(HLA_I_v2_missing))
len(HLA_I_v2_missing)

0

In [43]:
# Keep the v2 row order of the HLA-I alleles, dropping any name that has no entry
# in HLA_pseudo. The names are put in a set once so each membership test is a
# constant-time lookup instead of rebuilding and scanning the full name list.
HLA_pseudo_names = set(HLA_pseudo.name)
name_sort = [s for s in HLA_I_v2_name_rm_star if s in HLA_pseudo_names]

In [44]:
# Index by name so that the rows can be pulled out in exactly the order of name_sort.
HLA_pseudo_by_name = HLA_pseudo.set_index('name')
HLA_pseudo_sub = HLA_pseudo_by_name.loc[name_sort].reset_index()

In [45]:
HLA_pseudo_sub.shape

(85, 3)

In [46]:
HLA_pseudo_sub[:10]

Unnamed: 0,name,seq,first_three
0,HLA-B08:01,YDSEYRNIFTNTDESNLYLSYNYYTWAVDAYTWY,HLA
1,HLA-A24:02,YSAMYEEKVAHTDENIAYLMFHYYTWAVQAYTGY,HLA
2,HLA-A24:03,YSAMYEEKVAHTDENIAYLMFHYYTWAVQAYTWY,HLA
3,HLA-B38:02,YYSEYRNICTNTYENTAYLRYNFYTWAVLTYTWY,HLA
4,HLA-B51:01,YYATYRNIFTNTYENIAYWTYNYYTWAELAYLWH,HLA
5,HLA-B57:01,YYAMYGENMASTYENIAYIVYDSYTWAVLAYLWY,HLA
6,HLA-A23:01,YSAMYEEKVAHTDENIAYLMFHYYTWAVLAYTGY,HLA
7,HLA-B15:17,YYAMYRENMASTYENIAYLRYHDYTWAELAYLWY,HLA
8,HLA-B50:01,YHTKYREISTNTYESNLYWRYNLYTWAELAYLWY,HLA
9,HLA-B07:05,YYSEYRNIYAQTDESNLYLSYNYYTWAERAYEWY,HLA


In [47]:
# Element-wise check that HLA_pseudo_sub lists the alleles in the same order as the v2 names.
same_position = HLA_pseudo_sub["name"] == HLA_I_v2_name_rm_star
Counter(same_position)

Counter({True: 85})

In [48]:
# Write out the name/seq table of the selected HLA-I v2 alleles, without the
# helper column 'first_three'. The path is built from data_dir, like the input
# path at the top of the notebook, so the data location is defined once.
HLA_I_v2_pseudo_sub_path = os.path.join(data_dir, "HLA_I_v2_pseudo_sub.csv")
HLA_pseudo_sub.drop(columns=['first_three']).to_csv(HLA_I_v2_pseudo_sub_path, index=False)

In [49]:
# Check whether an allele that is listed under two name formats (with and
# without the colon, e.g. 'HLA-A01:01' and 'HLA-A0101') has the same seq in both rows.

def _first_seq_by_name(frame):
    """Map each value of frame['name'] to the seq of the first row carrying it."""
    lookup = {}
    for name, seq in zip(frame["name"], frame["seq"]):
        # setdefault keeps the first occurrence, like `.values[0]` on a filtered frame
        lookup.setdefault(name, seq)
    return lookup


HLA_pseudo_name_set = set(HLA_pseudo.name)

# Build the name -> seq lookups once instead of scanning both data frames
# with a boolean mask for every allele.
sub_seq_by_name = _first_seq_by_name(HLA_pseudo_sub)
all_seq_by_name = _first_seq_by_name(HLA_pseudo)

flag_list = []
for item in HLA_pseudo_sub.name:
    var_item = item.replace(":", "")
    # only alleles that also exist under the colon-free name can be compared
    if var_item in HLA_pseudo_name_set:
        flag_list.append(sub_seq_by_name[item] == all_seq_by_name[var_item])

Counter(flag_list)

Counter({True: 85})

In [None]:
# the result above shows that HLA_pseudo lists each of these alleles twice,
# under two name formats (with and without the colon), with the same seq in both rows