### This file extends the pseudo sequences for HLA I alleles

A summary is put in file 

    t4_summary.md

We have pseudo sequences for HLA alleles on 34 positions for 85 HLA-I alleles 

    ../../data/intermediate_data/HLA_I_v2_pseudo_sub.csv 

We want to extend the positions to also cover the additional positions obtained from

    t2_check_additional_pos_contacts.log.ipynb. 

First, we need to check whether we can get a unique pseudo sequence for each HLA allele on the additional positions.

Second, if that is true, we will keep only those additional positions that show diversity in amino acids across the alleles.

In [1]:
import numpy as np
import pandas as pd

from collections import Counter
from collections import defaultdict


 Materials:
 
 Full sequence file: 
 
     ../../data/intermediate_data/HLA_TCR_contact/ClassI_prot.alfas
 
 34 contact positions (1-indexed) from NetMHCPan:
 
 7, 9, 24, 45, 59, 62, 63, 66, 67, 69, 70, 73, 74, 76, 77, 80, 81, 84, 95, 97, 99, 114, 116, 118, 143, 147, 150, 152, 156, 158, 159, 163, 167, 171
 
 17 additional positions (0-indexed) from t2_check_additional_pos_contacts.log.ipynb:
 
 4, 57, 64, 67, 71, 74, 122, 145, 148, 150, 153, 154, 156, 160, 161, 165, 169
 
 pseudo sequences for the 34 positions: 
 
    ../../data/intermediate_data/HLA_I_v2_pseudo_sub.csv
    
 Output file:

A csv file storing the combined extended pseudo sequences for the 85 HLA-I alleles:

    ../../data/for_encoders/HLA_I_v2_pseudo_40.csv

In [2]:
# The 34 contact positions, numbered from 1 as listed in the Materials section.
thirty_four = [
    7, 9, 24, 45, 59, 62, 63, 66, 67, 69, 70, 73, 74, 76, 77, 80, 81,
    84, 95, 97, 99, 114, 116, 118, 143, 147, 150, 152, 156, 158, 159, 163, 167, 171,
]
# Shift every position down by one so the values can index Python strings directly.
HLA_I_34 = [position - 1 for position in thirty_four]

In [3]:
# Convert "../../data/intermediate_data/HLA_TCR_contact/ClassI_prot.alfas" into a
# dictionary that maps each short allele name (the part of the name up to the
# second colon) to the set of distinct full sequences recorded under it.
# The file is read as a table whose rows alternate between a ">name" row and
# the matching sequence row; only the first column is used.
ClassI_prot = pd.read_csv("../../data/intermediate_data/HLA_TCR_contact/ClassI_prot.alfas",
                          sep=' ', header=None)

# One record = one name row + one sequence row. Derive the record count from the
# file instead of hard-coding it (the author's run had 11761 records), so an
# updated file is neither truncated nor causes a KeyError.
n_records = len(ClassI_prot) // 2
name_ind = list(range(n_records))
names = ClassI_prot.iloc[0:2 * n_records:2, 0].tolist()
seqs = ClassI_prot.iloc[1:2 * n_records:2, 0].tolist()

# Guard the alternating-row assumption before stripping the ">" marker.
if not all(str(name).startswith(">") for name in names):
    raise ValueError(
        "ClassI_prot.alfas is not laid out as alternating '>name' / sequence rows"
    )
names = [name.replace(">", "") for name in names]

class_I_seqs = pd.DataFrame(list(zip(names, seqs)),
                            columns=['name', 'seq'])
# author's run: class_I_seqs.shape == (11761, 2); nunique: name 11761, seq 8201

# keep the fields up to the second colon of each name
short_name = [":".join(name.split(":")[:2]) for name in names]
class_I_seqs["short_name"] = short_name

seq_dict_I = defaultdict(set)

# Loop variables are deliberately not called `short_name`, so the list above
# is not overwritten by its last element.
for allele_short, full_seq in zip(class_I_seqs.short_name, class_I_seqs.seq):
    seq_dict_I[allele_short].add(full_seq)
len(seq_dict_I)

8671

For each HLA-I allele ("short_name", up to specific HLA protein level, cutoff after the second colon) in v2: 

first, check whether the corresponding pseudo sequences consisting of amino acids on the 34 positions are all the same, and if the same, whether it matches the pseudo sequence in file 

    ../../data/intermediate_data/HLA_I_v2_pseudo_sub.csv

second, check whether the corresponding pseudo sequences consisting of amino acids on the additional 17 positions are all the same. If so, how many of these 17 positions have diversity among the 85 HLA-I alleles;

Lastly, pick out those with diversity, combine them with the original 34 positions and construct a pseudo sequence on these positions. 

In [4]:
# Read the existing 34-position pseudo sequences and index them by allele name
# with the first four characters of the name dropped.
HLA_I_pseudo_34 = pd.read_csv(
    "../../data/intermediate_data/HLA_I_v2_pseudo_sub.csv",
    sep=',',
    header=0,
)
HLA_I_pseudo_34_dict = defaultdict(
    str,
    {
        full_name[4:]: pseudo
        for full_name, pseudo in zip(HLA_I_pseudo_34["name"], HLA_I_pseudo_34["seq"])
    },
)

In [5]:
# Verify that, for each of the 85 HLA-I alleles, every full sequence recorded
# for the allele gives the same pseudo sequence on the original 34 positions,
# and store that pseudo sequence under the allele name.
# The loop stops at the first allele that is missing or inconsistent.

pseudo_seqs_ori_HLA_I_dict = defaultdict(str)

for name in HLA_I_pseudo_34_dict:
    # insert "*" after the first character to build the key used in seq_dict_I
    star_name = name[0] + "*" + name[1:]
    # Use .get() rather than seq_dict_I[star_name]: indexing a defaultdict with
    # an unknown key would silently insert an empty set into seq_dict_I.
    list_seq = list(seq_dict_I.get(star_name, ()))
    if not list_seq:
        print("no full sequence found for allele")
        print(star_name)
        break
    # distinct pseudo sequences over all full sequences of this allele
    set_pseudo = {''.join([seq[i] for i in HLA_I_34]) for seq in list_seq}
    if len(set_pseudo) == 1:
        pseudo_seqs_ori_HLA_I_dict[name] = next(iter(set_pseudo))
    else:
        print("pseudo seqs don't match")
        print(name)
        break

In [6]:
# number of alleles for which a pseudo sequence on the 34 positions was stored
len(pseudo_seqs_ori_HLA_I_dict)

85

In [7]:
# Check whether the pseudo sequences rebuilt from the full sequences are equal,
# allele by allele, to those loaded from
# "../../data/intermediate_data/HLA_I_v2_pseudo_sub.csv".
# The comparison is a plain dict equality: same keys and same value per key.
# In the author's run they do match.
HLA_I_pseudo_34_dict == pseudo_seqs_ori_HLA_I_dict

True

In [8]:
# The 17 additional positions (0-indexed, per the Materials section).
add_HLA_I = [
    4, 57, 64, 67, 71, 74, 122, 145, 148,
    150, 153, 154, 156, 160, 161, 165, 169,
]

In [10]:
# Verify that, for each of the 85 HLA-I alleles, every full sequence recorded
# for the allele gives the same pseudo sequence on the additional positions,
# and store that pseudo sequence under the allele name.
# nunique_seqs collects the number of distinct full sequences per allele.
# The loop stops at the first allele that is missing or inconsistent.
nunique_seqs = []
pseudo_seqs_add_HLA_I_dict = defaultdict(str)

for name in HLA_I_pseudo_34_dict:
    # insert "*" after the first character to build the key used in seq_dict_I
    star_name = name[0] + "*" + name[1:]
    # Use .get() rather than seq_dict_I[star_name]: indexing a defaultdict with
    # an unknown key would silently insert an empty set into seq_dict_I.
    list_seq = list(seq_dict_I.get(star_name, ()))
    nunique_seqs.append(len(list_seq))
    if not list_seq:
        print("no full sequence found for allele")
        print(star_name)
        break
    # distinct pseudo sequences over all full sequences of this allele
    set_pseudo = {''.join([seq[i] for i in add_HLA_I]) for seq in list_seq}
    if len(set_pseudo) == 1:
        pseudo_seqs_add_HLA_I_dict[name] = next(iter(set_pseudo))
    else:
        print("pseudo seqs don't match")
        print(name)
        break

In [11]:
# Count, for each additional position, how many distinct amino acids occur at
# that position across the stored pseudo sequences of the alleles.
pseudo_seqs_add_HLA_I_values = list(pseudo_seqs_add_HLA_I_dict.values())

# Iterate over the actual number of additional positions instead of a literal
# 17, so the count always stays in step with add_HLA_I.
nunique_17 = [
    len({value[i] for value in pseudo_seqs_add_HLA_I_values})
    for i in range(len(add_HLA_I))
]

In [12]:
# Keep the additional positions at which more than one amino acid is observed.
# Positions and their counts are paired directly, so no hard-coded length is needed.
diverse_add_HLA_I = [
    position
    for position, n_amino_acids in zip(add_HLA_I, nunique_17)
    if n_amino_acids > 1
]
diverse_add_HLA_I

[64, 148, 150, 160, 165, 169]

In [14]:
# Merge the original 34 positions with the diverse additional ones and put the
# combined list in ascending order; this is the 40-position set for HLA-I alleles.
extended_HLA_I_40 = sorted(HLA_I_34 + diverse_add_HLA_I)

In [15]:
# Extract, for every allele, the pseudo sequence of amino acids on the 40
# positions, after checking that all full sequences recorded for the allele
# agree on those positions.
# The loop stops at the first allele that is missing or inconsistent.

HLA_I_name_85 = HLA_I_pseudo_34.name.tolist()

pseudo_seqs_HLA_I_40_dict = defaultdict(str)


for name in HLA_I_name_85:
    # take the character at index 4, add "*", then the rest of the name
    star_name = name[4] + "*" + name[5:]
    # seq_dict_I is a defaultdict, so any earlier `seq_dict_I[key]` lookup with
    # an unknown key leaves an empty set behind and a plain `in` test would
    # then pass. Treat an empty entry the same as a missing one, and use
    # .get() so this lookup does not insert anything itself.
    list_seq = list(seq_dict_I.get(star_name, ()))
    if not list_seq:
        print(False)
        print(star_name)
        break
    # distinct pseudo sequences over all full sequences of this allele
    set_pseudo = {''.join([seq[i] for i in extended_HLA_I_40]) for seq in list_seq}
    if len(set_pseudo) == 1:
        pseudo_seqs_HLA_I_40_dict[name] = next(iter(set_pseudo))
    else:
        print("pseudo seqs based on 40 positions don't match")
        print(name)
        break

In [94]:
# disabled on purpose: uncomment the next line to display the whole mapping
#pseudo_seqs_HLA_I_40_dict

In [16]:
# Collect the 40-position pseudo sequences in the same order as HLA_I_name_85.
# pseudo_seqs_HLA_I_40_dict is a defaultdict(str): indexing it with an allele
# that was never stored would quietly return "" and an empty pseudo sequence
# would end up in the output. Fail loudly instead.
missing_alleles = [key for key in HLA_I_name_85 if key not in pseudo_seqs_HLA_I_40_dict]
if missing_alleles:
    raise KeyError(
        "no 40-position pseudo sequence stored for: " + ", ".join(missing_alleles)
    )
pseudo_seqs_HLA_I_40_value = [pseudo_seqs_HLA_I_40_dict[key] for key in HLA_I_name_85]

In [27]:
# Insert "*" after the fifth character of each name, e.g. "HLA-B08:01" -> "HLA-B*08:01".
HLA_I_name_85_star = ["*".join((name[:5], name[5:])) for name in HLA_I_name_85]

# Two-column table: starred allele name and its 40-position pseudo sequence.
name_seq_pairs = list(zip(HLA_I_name_85_star, pseudo_seqs_HLA_I_40_value))
HLA_I_v2_pseudo_sub_40 = pd.DataFrame(name_seq_pairs, columns=['hla', "seq"])

# NOTE(review): the header markdown of this notebook names the output file
# "HLA_I_v2_pseudo_40.csv", while the path written here has no "_v2" - confirm
# which name downstream code expects.
HLA_I_v2_pseudo_sub_40.to_csv(
    "../../data/for_encoders/HLA_I_pseudo_40.csv",
    index=False,
)

In [30]:
# preview the first six rows of the table that was written out
HLA_I_v2_pseudo_sub_40.head(6)

Unnamed: 0,hla,seq
0,HLA-B*08:01,YDSEYRNQIFTNTDESNLYLSYNYYTWAARVDAYETEWRY
1,HLA-A*24:02,YSAMYEEGKVAHTDENIAYLMFHYYTWAAHVQAYETDGRY
2,HLA-A*24:03,YSAMYEEGKVAHTDENIAYLMFHYYTWAAHVQAYETEWRY
3,HLA-B*38:02,YYSEYRNQICTNTYENTAYLRYNFYTWAARVLTYETEWRY
4,HLA-B*51:01,YYATYRNQIFTNTYENIAYWTYNYYTWAARELAYELEWRH
5,HLA-B*57:01,YYAMYGERNMASTYENIAYIVYDSYTWAARVLAYELEWRY


In [20]:
# Concatenate every 40-position pseudo sequence into one string and report how
# many distinct characters (amino-acid letters) it contains.
all_aa = "".join(HLA_I_v2_pseudo_sub_40.seq.tolist())

len(Counter(all_aa))

19