In [1]:
# Set path to directory containing .py files
# Import .py file to extract distance between amino acids, contacts and attentions
# All necessary packages are imported within the .py file

import sys
sys.path.append("../py")

# ESM-2 model for protein embeddings is esm.pretrained.esm2_t33_650M_UR50D()
from extract_contact_aa_distance_attentions import * 


from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

ESM2(
  (embed_tokens): Embedding(33, 1280, padding_idx=1)
  (layers): ModuleList(
    (0-32): 33 x TransformerLayer(
      (self_attn): MultiheadAttention(
        (k_proj): Linear(in_features=1280, out_features=1280, bias=True)
        (v_proj): Linear(in_features=1280, out_features=1280, bias=True)
        (q_proj): Linear(in_features=1280, out_features=1280, bias=True)
        (out_proj): Linear(in_features=1280, out_features=1280, bias=True)
        (rot_emb): RotaryEmbedding()
      )
      (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
      (fc1): Linear(in_features=1280, out_features=5120, bias=True)
      (fc2): Linear(in_features=5120, out_features=1280, bias=True)
      (final_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
    )
  )
  (contact_head): ContactPredictionHead(
    (regression): Linear(in_features=660, out_features=1, bias=True)
    (activation): Sigmoid()
  )
  (emb_layer_norm_after): LayerNorm((1280,), eps=1

In [2]:
# Ignore warnings when importing PDB structure files.
# NOTE(review): PDBConstructionWarning is not defined in this notebook; presumably it
# arrives through the star import in the first cell (Biopython) -- confirm.
warnings.simplefilter('ignore', PDBConstructionWarning)
# Also silence every DeprecationWarning raised from here on, whichever package emits it.
warnings.filterwarnings("ignore", category=DeprecationWarning)

# Set variable for parsing PDB structural files.
# NOTE(review): `parser` is not referenced again in the cells visible here; presumably
# the helper module builds its own parser -- confirm whether this one is still needed.
parser = PDBParser()  

In [3]:
def download_pdb(pdb_id, structure_dir, downloadurl="http://files.rcsb.org/download/"):
    """
    Download a PDB structure file and save it as ``<pdb_id>.pdb`` in ``structure_dir``.

    The data is first written to a temporary ``<pdb_id>.pdb.part`` file and only
    moved to its final name once the transfer has completed, so an interrupted
    download never leaves a truncated ``.pdb`` file behind for later parsing.

    :param pdb_id: The standard PDB ID e.g. '3ICB' or '3icb'
    :param structure_dir: The (existing) directory where the downloaded file will be saved
    :param downloadurl: The base PDB download URL, cf.
        `https://www.rcsb.org/pages/download/http#structures` for details.
        It is concatenated with the file name as-is, so it must end with '/'.
        Note that the unencrypted HTTP protocol is used by default
        to avoid spurious OpenSSL errors...
    :return: the file name of the downloaded PDB file (``<pdb_id>.pdb``, without the
        directory part) or None if something went wrong
    """
    pdbfn = pdb_id + ".pdb"
    url = downloadurl + pdbfn
    outfnm = os.path.join(structure_dir, pdbfn)
    partfnm = outfnm + ".part"
    try:
        urllib.request.urlretrieve(url, partfnm)
        # Publish the file under its final name only after a complete transfer.
        os.replace(partfnm, outfnm)
        return pdbfn
    except Exception as err:
        # all sorts of things could have gone wrong...
        print(str(err), file=sys.stderr)
        # Best-effort cleanup of a partial download; the final path is never touched.
        try:
            os.remove(partfnm)
        except OSError:
            pass
        return None


def download_fasta(pdb_id, fasta_dir, downloadurl="https://www.rcsb.org/fasta/entry/"):
    """
    Download the FASTA record of a PDB entry and save it as ``<pdb_id>.fasta``.

    The data is first written to a temporary ``<pdb_id>.fasta.part`` file and only
    moved to its final name once the transfer has completed, so an interrupted
    download never leaves a truncated ``.fasta`` file behind.

    :param pdb_id: The standard PDB ID e.g. '3ICB'; it is appended to ``downloadurl``
    :param fasta_dir: The (existing) directory where the downloaded file will be saved
    :param downloadurl: The base FASTA download URL. It is concatenated with the
        ID as-is, so it must end with '/'.
    :return: ``pdb_id`` if the download succeeded or None if something went wrong
    """
    pdbfn = pdb_id
    url = downloadurl + pdbfn
    outfnm = os.path.join(fasta_dir, F'{pdbfn}.fasta')
    partfnm = outfnm + ".part"
    try:
        urllib.request.urlretrieve(url, partfnm)
        # Publish the file under its final name only after a complete transfer.
        os.replace(partfnm, outfnm)
        return pdbfn
    except Exception as err:
        # all sorts of things could have gone wrong...
        print(str(err), file=sys.stderr)
        # Best-effort cleanup of a partial download; the final path is never touched.
        try:
            os.remove(partfnm)
        except OSError:
            pass
        return None

In [26]:
# One-off download of the structure (.pdb) and sequence (.fasta) files for a single entry.
# The same pair of calls for '1R4Q' is kept below, disabled (presumably already run once):
# download_pdb('1R4Q', structure_dir, downloadurl="http://files.rcsb.org/download/")
# download_fasta('1R4Q', fasta_dir, downloadurl="https://www.rcsb.org/fasta/entry/")
# NOTE(review): structure_dir and fasta_dir are not defined in this notebook; presumably
# they come from the star import of the helper module -- confirm.
download_pdb('1DM0', structure_dir, downloadurl="http://files.rcsb.org/download/")
# Last expression of the cell: its return value (the PDB ID, or None on failure) is displayed.
download_fasta('1DM0', fasta_dir, downloadurl="https://www.rcsb.org/fasta/entry/")

'1DM0'

In [29]:
# Inspect the PDB-derived sequence stored for entry '1DM0'.
# NOTE(review): `protein_data` is assigned by load_fastas(fasta_dir) in a later cell and
# this cell's execution count (29) is out of order; on Restart & Run All it will likely
# raise NameError unless the star import also provides `protein_data` -- move or remove.
(protein_data['1DM0'])

'KEFTLDFSTAKTYVDSLNVIRSAIGTPLQTISSGGTSLLMIDSGTGDNLFAVDVRGIDPEEGRFNNLRLIVERNNLYVTGFVNRTNNVFYRFADFSHVTFPGTTAVTLSGDSSYTTLQRVAGISRTGMQINRHSLTTSYLDLMSHSGTSLTQSVARAMLRFVTVTAEALRFRQIQRGFRTTLDDLSGRSYVMTAEDVDLTLNWGRLSSVLPDYHGQDSVRVGRISFGSINAILGSVALILNCHHHASRVARMASDEFPSMCPADGRVRGITHNKILWDSSTLGAILM'

In [9]:
## Make sure to update directory paths from .py file prior to executing code

# Read the CASP training_95 file and collect every sequence ID with its sequence.
# training_95 holds the seed sequences of the clusters built at 95% sequence similarity.

prot_data_dict = parse_casp7_file(casp_95)

# Preview the first three entries: ID, sequence, then a blank separator line.
for casp_id, casp_seq in list(prot_data_dict.items())[:3]:
    print(f'{casp_id}\n{casp_seq}\n')

1RSO_2_B
GLLAAERAVSQVLDSLEEIHALTDSSEKDLDFLHSVFQDQHLHTLLDLYDKINTKS

3PCG_1_A
PIELLPETPSQTAGPYVHIGLALEAAGNPTRDQEIWNRLAKPDAPGEHILLLGQVYDGNGHLVRDSFLEVWQADANGEYQDAYNLENAFNSFGRTATTFDAGEWTLHTVKPGVVNNAAGVPMAPHINISLFARGINIHLHTRLYFDDEAQANAKCPVLNLIEQPQRRETLIAKRCEVDGKTAYRFDIRIQGEGETVFFDF

1YBD_1_A
MTQQIKYKRVLLKLSGESLMGSDPFGINHDTIVQTVGEIAEVVKMGVQVGIVVGGGNIFRGVSAQAGSMDRATADYMGMMATVMNALALKDAFETLGIKARVQSALSMQQIAETYARPKAIQYLEEGKVVIFAAGTGNPFFTTDTAAALRGAEMNCDVMLKATNVDGVYTADPKKDPSATRYETITFDEALLKNLKVMDATAFALCRERKLNIVVFGIAKEGSLKRVITGEDEGTLVHC



In [10]:
# CASP sequence IDs can refer to individual chains of a structure rather than to the
# whole entry, e.g. 1RSO_2_B refers to the 2nd part of chain B of the protein 1RSO.

## To be sure the CASP sequence corresponds to the sequence in the PDB structural
# file, keep only the PDB IDs that occur exactly once in the CASP data. For those,
# the stored sequence is taken to be the entire sequence of the protein (some
# sequences in the dataset only carry a label for a single chain).

# Tally how many CASP entries share each PDB ID (the text before the first '_')
protein_id_counts = Counter(
    casp_key.partition('_')[0] for casp_key in prot_data_dict
)

# Keep the PDB IDs that were counted exactly once
single_occurence_ids = [
    pdb_code
    for pdb_code, n_entries in protein_id_counts.items()
    if n_entries == 1
]

print('Number of Unique IDs: ', len(single_occurence_ids), '\n')
print(single_occurence_ids[:5])

Number of Unique IDs:  8494 

['1YBD', '1PSM', '1DM0', '1WFO', '1YOA']


In [11]:
## For the IDs that occur once in the CASP training_95 dataset, the sequences are taken
# from the PDB database. These sequences are used in generating PDB structural data,
# i.e. calculating the angstrom distance.

# Download step, kept disabled (presumably the FASTA files are already on disk):
# for pdb_id in single_occurence_ids:
#     download_fasta(pdb_id, fasta_dir, downloadurl="https://www.rcsb.org/fasta/entry/")

# Read the downloaded PDB FASTA files into a dictionary of sequence ID -> sequence
protein_data = load_fastas(fasta_dir)

print('Number of Sequences: ', len(protein_data), '\n')

# Preview the first three entries: ID, sequence, then a blank separator line.
for pdb_id, pdb_seq in list(protein_data.items())[:3]:
    print(f'{pdb_id}\n{pdb_seq}\n')

Number of Sequences:  2683 

1C4R
GHAGTTYIFSKGGGQITYKWPPNDRPSTRADRLAIGFSTVQKEAVLVRVDSSSGLGDYLELHIHQGKIGVKFNVGTDDIAIEESNAIINDGKYHVVRFTRSGGNATLQVDSWPVIERYPAGRQLTIFNSQATIIIGGKEQGQPFQGQLSGLYYNGLKVLNMAAENDANIAIVGNVRLVGEVP

1UE8
MYDWFKQMRKESPVYYDGKVWNLFKYEDCKMVLNDHKRFSSNLTGYNDKLEMLRSGKVFFDIPTRYTMLTSDPPLHDELRNLTADAFNPSNLPVDFVREVTVKLLSELDEEFDVIESFAIPLPILVISKMLGINPDVKKVKDWSDLVALRLGRADEIFSIGRKYLELISFSKKELDSRKGKEIVDLTGKIANSNLSELEKEGYFILLMIAGNETTTNLIGNAIEDFTLYNSWDYVREKGALKAVEEALRFSPPVMRTIRVTKEKVKIRDQVIDEGELVRVWIASANRDEEVFKDPDSFIPDRTPNPHLSFGSGIHLCLGAPLARLEARIALEEFAKKFRVKEIVKKEKIDNEVLNGYRKLVVRVERT

1I24
MRGSHHHHHHGSRVMVIGGDGYCGWATALHLSKKNYEVCIVDNLVRRLFDHQLGLESLTPIASIHDRISRWKALTGKSIELYVGDICDFEFLAESFKSFEPDSVVHFGEQRSAPYSMIDRSRAVYTQHNNVIGTLNVLFAIKEFGEECHLVKLGTMGEYGTPNIDIEEGYITITHNGRTDTLPYPKQASSFYHLSKVHDSHNIAFTCKAWGIRATDLNQGVVYGVKTDETEMHEELRNRLDYDAVFGTALNRFCVQAAVGHPLTVYGKGGQTRGYLDIRDTVQCVEIAIANPAKAGEFRVFNQFTEQFSVNELASLVTKAGSKLGLDVKKMTVPNPRVEAEEHYYNAKHTKLMELGLEPHYLSDSLLDSLLNFAVQFKDRVDTKQIMPSVSWKKIGVKTKSMT

In [14]:
# Store the sequence IDs whose sequence is the same in both the PDB database and the
# CASP training_95 dataset.
# Double-checking that the sequences match is important to ensure we are comparing
# the same proteins and chains that are the seed sequences in the CASP dataset clusters.
# NOTE(review): only the PDB sequences are passed in; presumably check_casp_pdb_seqs
# reads the CASP sequences from state inside the helper .py file -- confirm.

same_sequence_ids = check_casp_pdb_seqs(protein_data)

print('Number of Uniform Sequences: ', len(same_sequence_ids), '\n')

Number of Uniform Sequences:  1215 



In [16]:
# Extend the matching-ID list with '1R4Q' by hand. Assuming `same_sequence_ids` is a
# list, `+` builds a new list and leaves `same_sequence_ids` itself unchanged.
# NOTE(review): the reason '1R4Q' is added manually is not recorded -- presumably it did
# not pass the CASP/PDB sequence check but is still wanted downstream; document why.
same_sequence_ids_2 = same_sequence_ids + ['1R4Q']

In [24]:
# Sanity check: is the separately downloaded entry '1DM0' part of the ID list that is
# passed to contacts_per_pdb?
'1DM0' in same_sequence_ids_2

False

In [25]:
# Show the directory that download_pdb writes structure files to.
# NOTE(review): the displayed value is an absolute, user-specific path; prefer a path
# relative to the repository (configured in one place) and clear this output before sharing.
structure_dir

'/Users/williamharrigan/Desktop/Github/contact_site_classifier/attention_classifier/data_files/structure_files/'

In [21]:
# Confirm the hand-extended list holds exactly one more ID than `same_sequence_ids`.
len(same_sequence_ids_2)

1216

In [17]:
## To make sure we have a balanced number of contacts and non-contacts for each protein
# we make a subset of non-contact sites that is randomly selected and
# has the same number of amino acid pairs as the in-contact sites

in_contact_sites, non_contact_sites, subset_non_contact_sites = contacts_per_pdb(same_sequence_ids_2, protein_data)

# Preview up to three random in-contact pairs and up to three random non-contact pairs
# for the first sequence ID of each dictionary.
# Both inner loops live inside the zip loop, so nothing is printed (instead of a
# NameError on a never-assigned loop variable) when either dictionary is empty.
# NOTE: no random seed is set, so the sampled rows differ between runs.
for sequence_id_in_contact, sequence_id_non_contact in zip(
        list(in_contact_sites.keys())[:1], list(subset_non_contact_sites.keys())[:1]):
    contact_examples = in_contact_sites[sequence_id_in_contact]
    non_contact_examples = subset_non_contact_sites[sequence_id_non_contact]

    # Cap the sample size: random.sample raises ValueError when asked for more
    # items than the population holds.
    for instance in random.sample(contact_examples, min(3, len(contact_examples))):
        print(instance)

    for instance in random.sample(non_contact_examples, min(3, len(non_contact_examples))):
        print(instance)


{'res_1': 277, 'res_2': 260, 'sig_1': 'L', 'sig_2': 'V', 'aa_dist': 17, 'arn_dist': 4.6384096, 'in_contact': True}
{'res_1': 273, 'res_2': 264, 'sig_1': 'D', 'sig_2': 'K', 'aa_dist': 9, 'arn_dist': 4.175746, 'in_contact': True}
{'res_1': 271, 'res_2': 266, 'sig_1': 'V', 'sig_2': 'K', 'aa_dist': 5, 'arn_dist': 4.15248, 'in_contact': True}
{'res_1': 140, 'res_2': 127, 'sig_1': 'V', 'sig_2': 'I', 'aa_dist': 13, 'arn_dist': 10.564078, 'in_contact': False}
{'res_1': 341, 'res_2': 168, 'sig_1': 'K', 'sig_2': 'I', 'aa_dist': 173, 'arn_dist': 42.976677, 'in_contact': False}
{'res_1': 162, 'res_2': 109, 'sig_1': 'R', 'sig_2': 'D', 'aa_dist': 53, 'arn_dist': 36.617775, 'in_contact': False}


In [18]:
# Inspect the in-contact residue pairs recorded for the manually added entry '1R4Q'.
# NOTE(review): this displays the whole list; a slice such as [:5] would keep the
# saved output short.
in_contact_sites['1R4Q']

[{'res_1': 23,
  'res_2': 20,
  'sig_1': 'A',
  'sig_2': 'I',
  'aa_dist': 3,
  'arn_dist': 4.931343,
  'in_contact': True},
 {'res_1': 37,
  'res_2': 32,
  'sig_1': 'S',
  'sig_2': 'S',
  'aa_dist': 5,
  'arn_dist': 4.50448,
  'in_contact': True},
 {'res_1': 39,
  'res_2': 27,
  'sig_1': 'L',
  'sig_2': 'P',
  'aa_dist': 12,
  'arn_dist': 4.2691827,
  'in_contact': True},
 {'res_1': 41,
  'res_2': 25,
  'sig_1': 'I',
  'sig_2': 'G',
  'aa_dist': 16,
  'arn_dist': 4.1866007,
  'in_contact': True},
 {'res_1': 42,
  'res_2': 25,
  'sig_1': 'D',
  'sig_2': 'G',
  'aa_dist': 17,
  'arn_dist': 4.9172235,
  'in_contact': True},
 {'res_1': 50,
  'res_2': 1,
  'sig_1': 'F',
  'sig_2': 'K',
  'aa_dist': 49,
  'arn_dist': 4.371932,
  'in_contact': True},
 {'res_1': 51,
  'res_2': 2,
  'sig_1': 'A',
  'sig_2': 'E',
  'aa_dist': 49,
  'arn_dist': 4.864038,
  'in_contact': True},
 {'res_1': 52,
  'res_2': 3,
  'sig_1': 'V',
  'sig_2': 'F',
  'aa_dist': 49,
  'arn_dist': 4.7765656,
  'in_contact': T

In [20]:
# Inspect the non-contact residue pairs recorded for '1DM0'.
# NOTE(review): the saved output is an empty list rather than a KeyError, even though
# '1DM0' was reported as absent from same_sequence_ids_2 -- presumably
# contacts_per_pdb returns a defaultdict(list) (so this lookup inserts the key);
# confirm, and use .get('1DM0') if the insertion is unwanted.
non_contact_sites['1DM0']

[]

In [22]:
# List every sequence ID that has an entry in in_contact_sites.
# NOTE(review): this displays all keys at once; len(in_contact_sites) or
# list(in_contact_sites)[:10] would keep the saved output readable.
in_contact_sites.keys()

dict_keys(['1UE8', '1F8E', '1J5B', '1P0Z', '1N3K', '1JDM', '1NSH', '1Y5O', '2A55', '1RDG', '1XNE', '1AOY', '1XW3', '1G12', '2BCW', '1CTJ', '1R5S', '1MWW', '1SNB', '1CB1', '2CS2', '1W66', '1M4Y', '1WR0', '1WY8', '1K8O', '1KP6', '1WQB', '1GQN', '1ZU2', '1QD9', '1Z14', '1RYK', '1C3Z', '1AGG', '1WIB', '1ANP', '1APQ', '1QTX', '1X5K', '1CZN', '3CRD', '1LFD', '2B7T', '1NRZ', '1G8A', '1VYX', '1H20', '1DWN', '1PR9', '1WFO', '1KOZ', '1Y74', '2EZL', '1JJI', '1BTN', '1F53', '1CDT', '1F2D', '1WWX', '1RW0', '1GJX', '2U2F', '1SHX', '1MOL', '2A4C', '2BEM', '2FRH', '1RMK', '1K6K', '1WM8', '1G73', '1EQP', '2CY3', '1X6F', '1V60', '1TNR', '1NDG', '2FZ0', '2AB9', '1Q08', '1T0P', '8PAZ', '1GJZ', '1JFK', '2COQ', '2AEN', '1ZXF', '2MHR', '1NQU', '2CUN', '3TMK', '1X6D', '1OTF', '1KVJ', '2FVT', '1GAI', '2D42', '1WXL', '1HDJ', '1G7N', '2B7V', '1APS', '1X5I', '2FDN', '1WWZ', '1RW2', '1JAC', '1YX8', '1TIZ', '1NYK', '1QM7', '1FGL', '1C44', '1TH7', '1C01', '2B5R', '1PML', '1PSM', '1HBK', '1LY1', '1SIS', '1ZHC', '1RRA