This notebook explores the additional candidate contact positions provided by

    ../../data/intermediate_data/HLA_TCR_contact/contacts.log

A summary of the findings is recorded in the file

    t2_summary.md

TCR contacts and peptide contacts will be treated separately. 

HLA-I and HLA-II contacts will be treated separately.

The alpha and beta chains of HLA-II contacts will also be treated separately.

In [1]:
import numpy as np
import pandas as pd

import os

from collections import Counter
from collections import defaultdict

In [3]:
# Load the raw contact log: space-delimited with no header row, so pandas
# assigns integer column labels.
contacts = pd.read_csv(
    "../../data/intermediate_data/HLA_TCR_contact/contacts.log",
    header=None,
    sep=' ',
)
contacts.shape

(5475, 9)

In [4]:
contacts[:6]

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,tcr_contact_pos:,A,D,A*02,57,E,DEGQ,E,/home/pbradley/tcr_scripts/pdb_files/1ao7.pdb....
1,tcr_contact_pos:,A,D,A*02,64,R,GPR,R,/home/pbradley/tcr_scripts/pdb_files/1ao7.pdb....
2,tcr_contact_pos:,A,D,A*02,65,K,KNQ,K,/home/pbradley/tcr_scripts/pdb_files/1ao7.pdb....
3,tcr_contact_pos:,A,D,A*02,67,K,EKR,K,/home/pbradley/tcr_scripts/pdb_files/1ao7.pdb....
4,tcr_contact_pos:,A,D,A*02,68,A,ADSTV,A,/home/pbradley/tcr_scripts/pdb_files/1ao7.pdb....
5,tcr_contact_pos:,A,E,A*02,68,A,ADSTV,A,/home/pbradley/tcr_scripts/pdb_files/1ao7.pdb....


In [5]:
# Replace the integer column labels with descriptive names (order is positional).
contacts.columns = [
    "tag",
    "mhc_chain",
    "other_chain",
    "allele",
    "contact_column",
    "seq_at",
    "seq_pos",
    "seq_emerson",
    "pdb_file",
]
contacts.head(6)

Unnamed: 0,tag,mhc_chain,other_chain,allele,contact_column,seq_at,seq_pos,seq_emerson,pdb_file
0,tcr_contact_pos:,A,D,A*02,57,E,DEGQ,E,/home/pbradley/tcr_scripts/pdb_files/1ao7.pdb....
1,tcr_contact_pos:,A,D,A*02,64,R,GPR,R,/home/pbradley/tcr_scripts/pdb_files/1ao7.pdb....
2,tcr_contact_pos:,A,D,A*02,65,K,KNQ,K,/home/pbradley/tcr_scripts/pdb_files/1ao7.pdb....
3,tcr_contact_pos:,A,D,A*02,67,K,EKR,K,/home/pbradley/tcr_scripts/pdb_files/1ao7.pdb....
4,tcr_contact_pos:,A,D,A*02,68,A,ADSTV,A,/home/pbradley/tcr_scripts/pdb_files/1ao7.pdb....
5,tcr_contact_pos:,A,E,A*02,68,A,ADSTV,A,/home/pbradley/tcr_scripts/pdb_files/1ao7.pdb....


In [6]:
contacts.nunique()

tag                 2
mhc_chain           2
other_chain         3
allele             23
contact_column     95
seq_at             20
seq_pos           263
seq_emerson        43
pdb_file          109
dtype: int64

In [7]:
contacts.allele.unique()

array(['A*02', 'DRA*01', 'DRB1*01', 'DRB1*04', 'B*08', 'DRB1*15',
       'DRB5*01', 'B*35', 'B*57', 'B*44', 'DQA1*01', 'DQB1*05', 'A*24',
       'B*27', 'DQA1*03', 'DQB1*03', 'DRB3*03', 'B*51', 'DQA1*05',
       'DQB1*02', 'DPA1*01', 'DPB1*352', 'A*01'], dtype=object)

In [11]:
#Counter(contacts.allele)

In [8]:
contacts.contact_column.nunique()

95

In [9]:
Counter(contacts.tag)

Counter({'tcr_contact_pos:': 2014, 'pep_contact_pos:': 3461})

In [11]:
# Short name = the part of the allele string before the first '*'
# (e.g. 'DRB1*01' -> 'DRB1').
contacts['short_name'] = [allele.partition("*")[0] for allele in contacts['allele'].tolist()]

In [12]:
Counter(contacts.short_name)

Counter({'A': 2946,
         'DRA': 330,
         'DRB1': 250,
         'B': 1071,
         'DRB5': 27,
         'DQA1': 371,
         'DQB1': 396,
         'DRB3': 29,
         'DPA1': 23,
         'DPB1': 32})

Separate by tcr or peptide contact

In [22]:
# Rows tagged as TCR contact positions.
tcr_contacts = contacts[contacts.tag.eq('tcr_contact_pos:')]
tcr_contacts.shape

(2014, 10)

In [23]:
tcr_contacts.nunique()

tag                 1
mhc_chain           2
other_chain         2
allele             23
contact_column     53
seq_at             18
seq_pos           143
seq_emerson        23
pdb_file          109
short_name         10
dtype: int64

In [24]:
Counter(tcr_contacts.short_name)

Counter({'A': 1130,
         'B': 384,
         'DPA1': 4,
         'DPB1': 11,
         'DQA1': 132,
         'DQB1': 123,
         'DRA': 116,
         'DRB1': 91,
         'DRB3': 10,
         'DRB5': 13})

In [25]:
#Counter(tcr_contacts.contact_column)

In [19]:
#tcr_contacts[:6]

In [26]:
# Rows tagged as peptide contact positions.
pep_contacts = contacts[contacts.tag.eq('pep_contact_pos:')]

In [27]:
pep_contacts.shape

(3461, 10)

In [28]:
pep_contacts[:6]

Unnamed: 0,tag,mhc_chain,other_chain,allele,contact_column,seq_at,seq_pos,seq_emerson,pdb_file,short_name
21,pep_contact_pos:,A,C,A*02,4,M,LMV,M,/home/pbradley/tcr_scripts/pdb_files/1ao7.pdb....,A
22,pep_contact_pos:,A,C,A*02,6,Y,CHY,Y,/home/pbradley/tcr_scripts/pdb_files/1ao7.pdb....,A
23,pep_contact_pos:,A,C,A*02,8,F,FSTY,FY,/home/pbradley/tcr_scripts/pdb_files/1ao7.pdb....,A
24,pep_contact_pos:,A,C,A*02,44,M,MV,M,/home/pbradley/tcr_scripts/pdb_files/1ao7.pdb....,A
25,pep_contact_pos:,A,C,A*02,58,Y,Y,Y,/home/pbradley/tcr_scripts/pdb_files/1ao7.pdb....,A
26,pep_contact_pos:,A,C,A*02,62,E,EN,E,/home/pbradley/tcr_scripts/pdb_files/1ao7.pdb....,A


In [29]:
pep_contacts.nunique()

tag                 1
mhc_chain           2
other_chain         1
allele             23
contact_column     84
seq_at             20
seq_pos           228
seq_emerson        43
pdb_file          109
short_name         10
dtype: int64

In [30]:
Counter(pep_contacts.short_name)

Counter({'A': 1816,
         'B': 687,
         'DPA1': 19,
         'DPB1': 21,
         'DQA1': 239,
         'DQB1': 273,
         'DRA': 214,
         'DRB1': 159,
         'DRB3': 19,
         'DRB5': 14})

In [31]:
#Counter(pep_contacts.contact_column)

In [32]:
#pep_contacts.contact_column.unique()

In [33]:
#pep_contacts.contact_column.nunique()

In [34]:
#pep_contacts.contact_column.unique() + 1

Separate by HLA-I or HLA-II alleles

Then the parts related to HLA-II alleles are separated into alpha/beta chains. 

This results in the following six data frames:


tcr_HLA_I_contacts

tcr_HLA_II_contacts_alpha

tcr_HLA_II_contacts_beta 


pep_HLA_I_contacts

pep_HLA_II_contacts_alpha

pep_HLA_II_contacts_beta 

In [35]:
# Allele short names treated as HLA class I.
HLA_I_short_name_list = ['A', 'B']

# Allele short names treated as HLA class II (DR, DP and DQ loci).
HLA_II_short_name_list = [
    'DRA', 'DRB1', 'DRB3', 'DRB5',
    'DPA1', 'DPB1',
    'DQA1', 'DQB1',
]

In [37]:
# TCR contacts restricted to HLA-I alleles.
tcr_HLA_I_contacts = tcr_contacts.loc[tcr_contacts.short_name.isin(HLA_I_short_name_list)]

In [46]:
#Counter(tcr_HLA_I_contacts.contact_column)

In [47]:
# TCR contacts restricted to HLA-II alleles.
tcr_HLA_II_contacts = tcr_contacts.loc[tcr_contacts.short_name.isin(HLA_II_short_name_list)]

In [51]:
Counter(tcr_HLA_II_contacts.mhc_chain)

Counter({'A': 252, 'B': 248})

In [52]:
# HLA-II TCR contacts whose MHC chain is 'A' (alpha chain).
tcr_HLA_II_contacts_alpha = tcr_HLA_II_contacts.loc[tcr_HLA_II_contacts.mhc_chain.eq('A')]

In [56]:
#Counter(tcr_HLA_II_contacts_alpha.contact_column)

In [57]:
# HLA-II TCR contacts whose MHC chain is 'B' (beta chain).
tcr_HLA_II_contacts_beta = tcr_HLA_II_contacts.loc[tcr_HLA_II_contacts.mhc_chain.eq('B')]

In [59]:
Counter(tcr_HLA_II_contacts_beta.short_name)

Counter({'DPB1': 11, 'DQB1': 123, 'DRB1': 91, 'DRB3': 10, 'DRB5': 13})

In [60]:
# Peptide contacts restricted to HLA-I alleles.
pep_HLA_I_contacts = pep_contacts.loc[pep_contacts.short_name.isin(HLA_I_short_name_list)]

In [66]:
Counter(pep_HLA_I_contacts.mhc_chain)

Counter({'A': 2503})

In [67]:
# Peptide contacts restricted to HLA-II alleles.
pep_HLA_II_contacts = pep_contacts.loc[pep_contacts.short_name.isin(HLA_II_short_name_list)]

In [68]:
pep_HLA_II_contacts.shape

(958, 10)

In [70]:
# HLA-II peptide contacts whose MHC chain is 'A' (alpha chain).
pep_HLA_II_contacts_alpha = pep_HLA_II_contacts.loc[pep_HLA_II_contacts.mhc_chain.eq('A')]

In [71]:
# HLA-II peptide contacts whose MHC chain is 'B' (beta chain).
pep_HLA_II_contacts_beta = pep_HLA_II_contacts.loc[pep_HLA_II_contacts.mhc_chain.eq('B')]

In [75]:
Counter(pep_HLA_II_contacts_alpha.short_name)

Counter({'DPA1': 19, 'DQA1': 239, 'DRA': 214})

In [76]:
Counter(pep_HLA_II_contacts_beta.short_name)

Counter({'DPB1': 21, 'DQB1': 273, 'DRB1': 159, 'DRB3': 19, 'DRB5': 14})

Now move on to check the counts of different positions. 

Eventually, we will need to check positions for three components:
    
HLA-I alleles

HLA-II alpha chain

HLA-II beta chain

In [None]:
# Reminder only: this cell is a bare string literal listing the six data frames
# built in the cells above; it performs no computation.
'''
tcr_HLA_I_contacts

tcr_HLA_II_contacts_alpha

tcr_HLA_II_contacts_beta

pep_HLA_I_contacts

pep_HLA_II_contacts_alpha

pep_HLA_II_contacts_beta
'''

First, look into contact positions related to HLA_I.

In [84]:
# Baseline set of 34 HLA-I positions, 1-indexed.
# NOTE(review): presumably the contact positions already in use before this
# exploration — confirm the source of this list.
thirty_four = [
    7, 9, 24, 45, 59, 62, 63, 66, 67, 69, 70, 73, 74, 76, 77, 80, 81,
    84, 95, 97, 99, 114, 116, 118, 143, 147, 150, 152, 156, 158, 159,
    163, 167, 171,
]
# Shift by one to get 0-indexed positions, comparable with contact_column.
HLA_I_34 = [pos - 1 for pos in thirty_four]

In [82]:
# Tally how often each position appears among the HLA-I TCR contacts.
tcr_HLA_I_contacts_counter = Counter(tcr_HLA_I_contacts['contact_column'].tolist())

# Positions in ascending order, with their counts listed in the same order.
tcr_HLA_I_contacts_key_list = sorted(tcr_HLA_I_contacts_counter.keys())
tcr_HLA_I_contacts_counts = [tcr_HLA_I_contacts_counter[pos] for pos in tcr_HLA_I_contacts_key_list]

df_tcr_HLA_I_contacts = pd.DataFrame({
    'pos': tcr_HLA_I_contacts_key_list,
    'count': tcr_HLA_I_contacts_counts,
})
# 0 indexed positions
df_tcr_HLA_I_contacts.to_csv("../../data/intermediate_data/t2_tcr_HLA_I_contacts.csv", index=False)

df_tcr_HLA_I_contacts.shape

In [136]:
#df_tcr_HLA_I_contacts

In [274]:
# Keep HLA-I TCR contact positions that were observed at least 10 times.
sub_tcr_HLA_I_contacts = df_tcr_HLA_I_contacts.loc[df_tcr_HLA_I_contacts['count'].ge(10)]

In [275]:
set(sub_tcr_HLA_I_contacts.pos) - set(HLA_I_34)

{57, 64, 67, 71, 74, 145, 148, 150, 153, 154, 156, 160, 161, 165, 169}

In [276]:
len(set(HLA_I_34) - set(sub_tcr_HLA_I_contacts.pos))

21

In [98]:
# Tally how often each position appears among the HLA-I peptide contacts.
pep_HLA_I_contacts_counter = Counter(pep_HLA_I_contacts['contact_column'].tolist())

# Positions in ascending order, with their counts listed in the same order.
pep_HLA_I_contacts_key_list = sorted(pep_HLA_I_contacts_counter.keys())
pep_HLA_I_contacts_counts = [pep_HLA_I_contacts_counter[pos] for pos in pep_HLA_I_contacts_key_list]

df_pep_HLA_I_contacts = pd.DataFrame({
    'pos': pep_HLA_I_contacts_key_list,
    'count': pep_HLA_I_contacts_counts,
})
# 0-indexed positions
df_pep_HLA_I_contacts.to_csv("../../data/intermediate_data/t2_pep_HLA_I_contacts.csv", index=False)
df_pep_HLA_I_contacts.shape

#Counter(df_pep_HLA_I_contacts['count'].tolist())

In [277]:
# Keep HLA-I peptide contact positions that were observed at least 10 times.
sub_pep_HLA_I_contacts = df_pep_HLA_I_contacts.loc[df_pep_HLA_I_contacts['count'].ge(10)]

In [278]:
sub_pep_HLA_I_contacts.shape

(37, 2)

In [279]:
set(sub_pep_HLA_I_contacts.pos) - set(HLA_I_34)

{4, 71, 122, 145, 154}

In [286]:
set(HLA_I_34) - set(sub_pep_HLA_I_contacts.pos)

{117, 157}

In [281]:
# Positions that pass the count filter for TCR or peptide contacts and are not
# already in HLA_I_34, in ascending order.
add_HLA_I = sorted(
    (set(sub_tcr_HLA_I_contacts.pos.tolist()) | set(sub_pep_HLA_I_contacts.pos.tolist()))
    - set(HLA_I_34)
)

In [288]:
add_HLA_I

[4, 57, 64, 67, 71, 74, 122, 145, 148, 150, 153, 154, 156, 160, 161, 165, 169]

Second, look into positions related to HLA_II alpha chain.

In [160]:
# Baseline set of 15 HLA-II alpha-chain positions, 1-indexed.
fifteen = [
    9, 11, 22, 24, 31,
    52, 53, 58, 59, 61,
    65, 66, 68, 72, 73,
]
# The same positions shifted by one (0-indexed).
HLA_II_alpha = [pos - 1 for pos in fifteen]

In [161]:
HLA_II_alpha

[8, 10, 21, 23, 30, 51, 52, 57, 58, 60, 64, 65, 67, 71, 72]

In [162]:
# Occurrences of each position among the HLA-II alpha-chain TCR contacts.
tcr_HLA_II_contacts_alpha_counter = Counter(tcr_HLA_II_contacts_alpha['contact_column'].tolist())

In [163]:
tcr_HLA_II_contacts_alpha_counter

Counter({35: 10,
         49: 2,
         50: 1,
         51: 29,
         53: 38,
         54: 34,
         55: 3,
         56: 16,
         57: 37,
         58: 14,
         60: 20,
         61: 23,
         63: 7,
         64: 16,
         67: 2})

In [165]:
# Observed alpha-chain TCR contact positions in ascending order.
tcr_HLA_II_contacts_alpha_key_list = sorted(tcr_HLA_II_contacts_alpha_counter.keys())
tcr_HLA_II_contacts_alpha_key_list

[35, 49, 50, 51, 53, 54, 55, 56, 57, 58, 60, 61, 63, 64, 67]

In [166]:
# Counts listed in the same order as the sorted position list.
tcr_HLA_II_contacts_alpha_counts = [
    tcr_HLA_II_contacts_alpha_counter[pos] for pos in tcr_HLA_II_contacts_alpha_key_list
]
df_tcr_HLA_II_contacts_alpha = pd.DataFrame({
    'pos': tcr_HLA_II_contacts_alpha_key_list,
    'count': tcr_HLA_II_contacts_alpha_counts,
})
# 0 indexed positions
df_tcr_HLA_II_contacts_alpha.to_csv("../../data/intermediate_data/t2_tcr_HLA_II_contacts_alpha.csv", index=False)

In [177]:
df_tcr_HLA_II_contacts_alpha.shape

(15, 2)

In [169]:
# Occurrences of each position among the HLA-II alpha-chain peptide contacts.
pep_HLA_II_contacts_alpha_counter = Counter(pep_HLA_II_contacts_alpha['contact_column'].tolist())

In [171]:
#pep_HLA_II_contacts_alpha_counter

In [173]:
# Observed alpha-chain peptide contact positions in ascending order.
pep_HLA_II_contacts_alpha_key_list = sorted(pep_HLA_II_contacts_alpha_counter.keys())
#pep_HLA_II_contacts_alpha_key_list

In [174]:
# Counts listed in the same order as the sorted position list.
pep_HLA_II_contacts_alpha_counts = [
    pep_HLA_II_contacts_alpha_counter[pos] for pos in pep_HLA_II_contacts_alpha_key_list
]
df_pep_HLA_II_contacts_alpha = pd.DataFrame({
    'pos': pep_HLA_II_contacts_alpha_key_list,
    'count': pep_HLA_II_contacts_alpha_counts,
})
# 0 indexed positions
df_pep_HLA_II_contacts_alpha.to_csv("../../data/intermediate_data/t2_pep_HLA_II_contacts_alpha.csv", index=False)

In [176]:
df_pep_HLA_II_contacts_alpha.shape

(30, 2)

In [233]:
# Shift each of the fifteen 1-indexed alpha-chain positions down by 4.
# NOTE(review): presumably this offset aligns the reference numbering with the
# contact_column numbering for the alpha chain — confirm where the 4 comes from.
modify_15 = [pos - 4 for pos in fifteen]
modify_15

[5, 7, 18, 20, 27, 48, 49, 54, 55, 57, 61, 62, 64, 68, 69]

In [234]:
[tcr_HLA_II_contacts_alpha_counter[key] for key in modify_15]

[0, 0, 0, 0, 0, 0, 2, 34, 3, 37, 23, 0, 16, 0, 0]

In [232]:
set(modify_15) - set(df_pep_HLA_II_contacts_alpha.pos)

set()

In [220]:
[pep_HLA_II_contacts_alpha_counter[key] for key in modify_15]

[3, 10, 25, 26, 10, 26, 26, 23, 6, 5, 26, 25, 19, 26, 19]

In [231]:
# keep all contact positions with counts >= 3
sub_tcr_HLA_II_contacts_alpha = df_tcr_HLA_II_contacts_alpha.loc[
    df_tcr_HLA_II_contacts_alpha['count'].ge(3)
]
sub_tcr_HLA_II_contacts_alpha.shape

(12, 2)

In [229]:
# Keep alpha-chain peptide contact positions with counts >= 3.
sub_pep_HLA_II_contacts_alpha = df_pep_HLA_II_contacts_alpha.loc[
    df_pep_HLA_II_contacts_alpha['count'].ge(3)
]
sub_pep_HLA_II_contacts_alpha.shape

(28, 2)

In [235]:
set(sub_tcr_HLA_II_contacts_alpha.pos.tolist()).union(set(sub_pep_HLA_II_contacts_alpha.pos.tolist())) - set(modify_15)

{4, 28, 35, 39, 45, 46, 47, 50, 51, 53, 56, 58, 60, 63, 65, 67, 71, 72}

In [239]:
# Filtered alpha-chain positions (TCR or peptide contacts) that are not in
# modify_15, in ascending order.
add_HLA_II_alpha = sorted(
    (set(sub_tcr_HLA_II_contacts_alpha.pos.tolist()) | set(sub_pep_HLA_II_contacts_alpha.pos.tolist()))
    - set(modify_15)
)
add_HLA_II_alpha

[4, 28, 35, 39, 45, 46, 47, 50, 51, 53, 56, 58, 60, 63, 65, 67, 71, 72]

Finally, look into positions related to the HLA-II beta chain.

In [183]:
# Baseline set of 19 HLA-II beta-chain positions (reference numbering; shifted
# in a later cell before comparison with contact_column).
nineteen = [
    9, 11, 13, 26, 28, 30, 47, 57, 67, 70,
    71, 74, 77, 78, 81, 85, 86, 89, 90,
]

In [185]:
# Occurrences of each position among the HLA-II beta-chain TCR contacts.
tcr_HLA_II_contacts_beta_counter = Counter(tcr_HLA_II_contacts_beta['contact_column'].tolist())

In [187]:
len(tcr_HLA_II_contacts_beta_counter)

18

In [188]:
# Observed beta-chain TCR contact positions in ascending order.
tcr_HLA_II_contacts_beta_key_list = sorted(tcr_HLA_II_contacts_beta_counter.keys())
tcr_HLA_II_contacts_beta_key_list

[53, 54, 57, 59, 60, 62, 63, 64, 65, 66, 67, 69, 70, 71, 73, 74, 77, 78]

In [189]:
# Counts listed in the same order as the sorted position list.
tcr_HLA_II_contacts_beta_counts = [
    tcr_HLA_II_contacts_beta_counter[pos] for pos in tcr_HLA_II_contacts_beta_key_list
]
df_tcr_HLA_II_contacts_beta = pd.DataFrame({
    'pos': tcr_HLA_II_contacts_beta_key_list,
    'count': tcr_HLA_II_contacts_beta_counts,
})
# 0 indexed positions
df_tcr_HLA_II_contacts_beta.to_csv("../../data/intermediate_data/t2_tcr_HLA_II_contacts_beta.csv", index=False)

In [191]:
#df_tcr_HLA_II_contacts_beta

In [192]:
# Occurrences of each position among the HLA-II beta-chain peptide contacts.
pep_HLA_II_contacts_beta_counter = Counter(pep_HLA_II_contacts_beta['contact_column'].tolist())

In [194]:
# Observed beta-chain peptide contact positions in ascending order.
pep_HLA_II_contacts_beta_key_list = sorted(pep_HLA_II_contacts_beta_counter.keys())
#pep_HLA_II_contacts_beta_key_list

In [195]:
# Counts listed in the same order as the sorted position list.
pep_HLA_II_contacts_beta_counts = [
    pep_HLA_II_contacts_beta_counter[pos] for pos in pep_HLA_II_contacts_beta_key_list
]
df_pep_HLA_II_contacts_beta = pd.DataFrame({
    'pos': pep_HLA_II_contacts_beta_key_list,
    'count': pep_HLA_II_contacts_beta_counts,
})
# 0 indexed positions
df_pep_HLA_II_contacts_beta.to_csv("../../data/intermediate_data/t2_pep_HLA_II_contacts_beta.csv", index=False)

In [197]:
# Number of distinct beta-chain positions seen as either a TCR or a peptide
# contact. The recorded output of this cell is an integer, so the union is
# wrapped in len(); without it the cell would display the set itself.
len(set(df_tcr_HLA_II_contacts_beta.pos.tolist()).union(set(df_pep_HLA_II_contacts_beta.pos.tolist())))

38

In [264]:
# Shift each of the nineteen beta-chain positions down by 7.
# NOTE(review): presumably this offset aligns the reference numbering with the
# contact_column numbering for the beta chain — confirm where the 7 comes from.
modify_19 = [pos - 7 for pos in nineteen]
modify_19

[2, 4, 6, 19, 21, 23, 40, 50, 60, 63, 64, 67, 70, 71, 74, 78, 79, 82, 83]

In [265]:
set(modify_19) - set(pep_HLA_II_contacts_beta_key_list)

{82, 83}

In [258]:
# Keep beta-chain TCR contact positions with counts >= 10.
sub_tcr_HLA_II_contacts_beta = df_tcr_HLA_II_contacts_beta.loc[
    df_tcr_HLA_II_contacts_beta['count'].ge(10)
]
sub_tcr_HLA_II_contacts_beta.shape

(10, 2)

In [266]:
[pep_HLA_II_contacts_beta_counter[key] for key in modify_19]

[12, 23, 26, 20, 22, 18, 13, 26, 25, 17, 22, 22, 21, 26, 26, 26, 10, 0, 0]

In [252]:
# Keep beta-chain peptide contact positions with counts >= 10.
sub_pep_HLA_II_contacts_beta = df_pep_HLA_II_contacts_beta.loc[
    df_pep_HLA_II_contacts_beta['count'].ge(10)
]
sub_pep_HLA_II_contacts_beta.shape

(21, 2)

In [267]:
set(sub_tcr_HLA_II_contacts_beta.pos.tolist()).union(set(sub_pep_HLA_II_contacts_beta.pos.tolist())) - set(modify_19)

{49, 53, 54, 57, 59, 62, 66, 69, 75}

In [268]:
# Filtered beta-chain positions (TCR or peptide contacts) that are not in
# modify_19, in ascending order.
add_HLA_II_beta = sorted(
    (set(sub_tcr_HLA_II_contacts_beta.pos.tolist()) | set(sub_pep_HLA_II_contacts_beta.pos.tolist()))
    - set(modify_19)
)
add_HLA_II_beta

[49, 53, 54, 57, 59, 62, 66, 69, 75]

In [269]:
len(add_HLA_II_beta)

9