In [4]:
from Bio import SeqIO
import numpy as np
import torch
import torch.nn as nn
import pandas as pd
import warnings

cos = nn.CosineSimilarity(dim=0, eps=1e-6)
import esm

import matplotlib.pyplot as plt
import seaborn as sns

from Bio.PDB import PDBParser, PPBuilder
from Bio.PDB.PDBExceptions import PDBConstructionWarning
from Bio.SeqUtils import IUPACData

from collections import defaultdict
import os
import sys
import urllib.request
import Bio.PDB
import random
import glob
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.model_selection import GridSearchCV



In [5]:
# Set directories

base_dir = '/Users/williamharrigan/Desktop/Github/contact_site_classifier/attention_classifier/data_files/'
desktop = '/Users/williamharrigan/Desktop/'
fasta_file = base_dir + 'rcsb_pdb_3KYN.fasta'
pdb_filename = base_dir + '3kyn.pdb'
structure_dir = base_dir +'structure_files/'
fasta_dir = base_dir +'fasta_files/'
casp_dir = '/Users/williamharrigan/Desktop/UH/Year_2/Research/contact_site_classifier/casp7/' 
casp_95 = casp_dir + 'training_95'

In [6]:
## This code is not important right now, will be when adding multiple sequences

# seqs = SeqIO.to_dict(SeqIO.parse(full_len_sequences, "fasta"))
# for k,v in seqs.items():
#     seqs[k] = str(v.seq)
    
# print(list(seqs.keys())[:5])

In [7]:
def simple_aa(three_letter_code):
    return IUPACData.protein_letters_3to1.get(three_letter_code.capitalize())

## To Do:


1. Get sequence IDs from CASP training file
2. Extract sequence from sequence ID from PDB database
3. Extract .pdb structural file from PDB for same ID
4. Double check that seqeunce files that are extracted are the same as the structural files
5. Get embeddings from extracted sequences (try 100 sequences first)
6. Turn contact site code into functions that can be run iteratively
7. Integrate this code with the classifier
    1. Make sure that when training model we balance positive and negative cases. (Start with 50/50 balance)
    2. Do so by making contact and non-contact dictionaries to pull training data from
8. Start working on implementing statstics like AUROC, loss functions and other hyperparameter optimization tasks. 

## Extract Sequence IDs and Structure (.pdb) files

In [31]:
def get_casp_ids(file_path):
    with open(file_path, 'r') as file:
        for line in file:
            line = line.strip()
            if line.startswith('[ID]'):
                sequence_id = next(file).strip()
                print(sequence_id)

In [8]:
def parse_casp7_file(file_path):
    data_dict = {}

    with open(file_path, 'r') as file:
        for line in file:
            line = line.strip()
            if line.startswith('[ID]'):
                sequence_id = next(file).strip()
            elif line.startswith('[PRIMARY]'):
                sequence = next(file).strip()
#                 print(sequence_id, sequence)
                data_dict[sequence_id] = sequence

    return data_dict

In [9]:
def download_pdb(pdb_id, structure_dir, downloadurl="http://files.rcsb.org/download/"):
    """
    Downloads a PDB file from the Internet and saves it in a data directory.
    :param pdbcode: The standard PDB ID e.g. '3ICB' or '3icb'
    :param datadir: The directory where the downloaded file will be saved
    :param downloadurl: The base PDB download URL, cf.
        `https://www.rcsb.org/pages/download/http#structures` for details
        Note that the unencrypted HTTP protocol is used by default
        to avoid spurious OpenSSL errors...
    :return: the full path to the downloaded PDB file or None if something went wrong
    """
    pdbfn = pdb_id + ".pdb"
    url = downloadurl + pdbfn
    outfnm = os.path.join(structure_dir, pdbfn)
    try:
        urllib.request.urlretrieve(url, outfnm)
        return pdbfn
    except Exception as err:
        # all sorts of things could have gone wrong...
        print(str(err), file=sys.stderr)
        return None

In [10]:
def download_fasta(pdb_id, fasta_dir, downloadurl="https://www.rcsb.org/fasta/entry/"):
    pdbfn = pdb_id
    url = downloadurl + pdbfn
    outfnm = os.path.join(fasta_dir, F'{pdbfn}.fasta')
    try:
        urllib.request.urlretrieve(url, outfnm)
        return pdbfn
    except Exception as err:
        # all sorts of things could have gone wrong...
        print(str(err), file=sys.stderr)
        return None

In [11]:
### HERE
# Turned off warnings for discontinuous data structures

warnings.simplefilter('ignore', PDBConstructionWarning)

# Load the structure from locally saved file
parser = PDBParser()

In [12]:
def check_sequences(pdb_id):
    # Parse pdb file and save as structure. The pdb file is where we are getting CA coordinates from.
    structure = parser.get_structure(pdb_id, f"{structure_dir+pdb_id}.pdb")

    # Extract desired protein structure from PDB structure (typically only 1 structure to choose from)
    protein_structure = structure[0]

    residue_position = 0
    mismatches = 0
#     print(pdb_id)
    if 'A' in protein_structure:
        for residue in protein_structure['A']:
            if 'CA' in residue:
                if residue_position < len(protein_data[pdb_id]):
                    if simple_aa(residue.resname) != protein_data[pdb_id][residue_position]:
#                         print(residue.id[1], simple_aa(residue.resname), protein_data[pdb_id][residue_position])
                        mismatches+=1
            residue_position+=1
        if mismatches == 0:
            same_sequence_ids.append(pdb_id)
#         print(pdb_id)
    return pdb_id


In [13]:
# 2. Prepare sequence input from a fasta file and already loaded .pdb file
# Load sequence file

# seq_chains = []

# for record in SeqIO.parse(fasta_file, "fasta"):
#     seq_chains.append(str(record.seq))

# pdb_id = record.id.split('|')[0].split('_')[0]
# protein_sequence = ''.join(seq_chains)
# Replace 'training_30.txt' with the path to your file

In [14]:
# Get sequences from CASP7 file
prot_data_dict = parse_casp7_file(casp_95)

protein_id_counts = Counter(protein_id.split('_')[0] for protein_id in prot_data_dict)

# Now, create the test list with only those protein_ids that occur exactly once
single_occurence_ids = [protein_id for protein_id, count in protein_id_counts.items() if count == 1]

In [None]:
# # Load sequence file

# protein_data = {}

# for pdb_id in single_occurence_ids:
#     check = download_fasta(pdb_id, fasta_dir)
#     if check == None:
#         continue
#     else:
# #         download_pdb(pdb_id, structure_dir)
#         fasta_file = fasta_dir + pdb_id + '.fasta'
#         for record in SeqIO.parse(fasta_file, "fasta"):
#             print(record.id.split('|')[0].split('_')[0])
#             print(len(str(record.seq)))
#             protein_data[record.id.split('|')[0].split('_')[0]] = str(record.seq)
#             break

# # pdb_id = record.id.split('|')[0].split('_')[0]
# # protein_sequence = ''.join(seq_chains)
# # Replace 'training_30.txt' with the path to your file

In [None]:
## Take pdb_ids that occur only once in CASP7 dataset and generate fasta files 
# of sequences from pdb
# Only sequences in first chain are taken to keep things simple down the line


# protein_data = {}

# for pdb_id in single_occurence_ids:
#     fasta_file = fasta_dir + pdb_id + '.fasta'
#     for record in SeqIO.parse(fasta_file, "fasta"):
#         print(record.id.split('|')[0].split('_')[0])
#         print(len(str(record.seq)))
#         protein_data[record.id.split('|')[0].split('_')[0]] = str(record.seq)
#         break

In [15]:
## IF FASTA FILES ALREADY GENERATED
## Load pdb_ids and corresponding sequence from already generated fasta files in fasta_dir
## Only sequences in first chain are taken to keep things simple down the line
## SHOULD BE 2681 KEYS

protein_data = {}

for i in glob.glob(f'{fasta_dir}/*'):
#     pdb_id = i.split('/')[9].split('.')[0]
    fasta_file = i
    for record in SeqIO.parse(fasta_file, "fasta"):
        print(record.id.split('|')[0].split('_')[0])
        print(len(str(record.seq)))
        protein_data[record.id.split('|')[0].split('_')[0]] = str(record.seq)
        break

1C4R
182
1UE8
367
1I24
404
1CPT
428
1F8E
388
1CNU
137
1J5B
38
1F3M
80
1P0Z
131
1N3K
130
1XTZ
264
1JDM
31
1NSH
101
1YC7
124
1Y5O
115
1Q1L
401
1LCS
211
2A55
133
1RDG
52
2A10
125
1DCU
357
2ETH
154
1W2L
99
2ACV
463
2ETS
128
2HP8
79
1XNE
113
1O6C
388
1NIW
148
1FSU
492
1AOY
78
1F3V
179
1KAE
434
1XW3
110
1XM7
195
1KPI
302
1G12
167
2BCW
65
1CTJ
89
1R5S
132
1STF
212
1MWW
128
1SNB
64
2F4Q
346
1CB1
78
1XAH
354
1TRN
224
1WU8
256
1KWA
88
2CS2
134
1L1J
239
1L5O
356
1W66
232
2CAL
154
1E0Z
128
1M4Y
171
1WR0
81
1I6U
37
1HSE
334
1WY8
89
1OY2
207
1TZK
338
1K8O
93
1CT5
256
1KP6
79
1DV8
128
1WQB
32
1WOC
103
1ILR
152
1UAY
242
1GQN
252
1TH5
74
2C0C
362
1ZU2
158
1V0L
313
1ZQ7
207
2F8J
347
1QZ8
113
1O5U
101
1QD9
124
2FT6
124
1Z14
549
1AD5
438
1RYK
69
2AQL
173
1LKV
232
1C3Z
108
1K7Y
577
1F32
149
2GHS
326
1AGG
48
1VI9
299
1H8U
117
1X8Y
86
2B69
343
1WIB
92
1ANP
28
1MGR
99
1APQ
53
2A8X
464
1QTX
148
1KZH
555
1X5K
124
1TJ1
602
1CZN
169
1X1N
524
3CRD
100
1LFD
87
1ZES
125
2B7T
73
1NRZ
164
1G8A
227
1VYX
60
1WN5
130
1H2

1NMB
470
2BZ1
196
1L2L
457
1W15
153
1SPH
88
1DZ3
130
1V8Y
170
1QDV
99
1YZQ
170
1LLN
262
1SB6
64
2AHU
531
2AVT
378
1TRE
255
1XEF
241
2F4Z
193
1ET1
34
1Z65
30
1R26
125
1NWD
148
1FSG
233
1BDB
277
1UP8
598
1G2R
100
1E07
642
1WHY
97
1ZR3
211
1BR9
194
1YKD
398
2BHM
164
1P0S
138
1UBF
349
1YOA
159
2BCE
579
2B18
164
1KNZ
5
1Q27
171
2FPE
62
1D2S
170
1THX
115
2ARH
203
1Z98
281
1ZKE
83
2AVM
99
1S4W
20
1RKX
357
1VFY
73
1GO9
42
1O9G
250
1BKT
38
1EIY
76
6RXN
46
1HW6
278
1WYN
146
1WCJ
103
1S83
223
2CP6
172
1QZU
206
1KAL
29
1UFX
103
2C3G
98
1QKY
38
1H4G
207
1MFR
176
1WLG
299
1IUR
88
1OZ6
120
1G6L
203
1RDU
116
2C0J
161
1CXC
124
1TL9
339
2G9N
221
1UAP
154
1SFN
246
1HE7
126
1H7J
26
1WKO
180
1PSF
69
1FTT
68
1XWE
151
1NNV
110
1SN4
64
1VTX
42
1W1H
151
1SJ1
66
1RGX
114
1HSL
238
2AZV
59
1DUC
134
1RVT
328
1CE0
37
1TUO
464
1C3H
137
1TQJ
230
1QYC
308
1MJI
34
1I10
331
1BHB
71
1PFT
50
1XBW
109
1TKU
204
1S7Z
117
1DUX
13
1RYB
205
2A61
145
1JCL
260
1UW0
117
1AVB
226
1M79
192
1YLL
200
1NB2
150
1WFF
85
1Z0P
84
1ELU
390


In [None]:
# # Prepare sequence input to ESM/contact analysis from CASP7 dataset
# # Load CASP file containing PDB IDs and Sequences

# protein_data = parse_casp7_file(casp)
# same_sequence_ids = []

# for pdb_id, protein_sequence in list(protein_data.items()):
#     print('Sequence ID: ', pdb_id)
# #     print('Sequence: ', protein_sequence, len(protein_sequence), '\n')
    
#     # Download pdb structure files from PDB database using CASP7 protein ids
#     print('PDB Structural File Output: ', download_pdb(pdb_id, structure_dir), '\n')
#     check_sequences(pdb_id)
# #     Index protein sequence as sequence 0 (next sequence would be indexed as 1)
#     esm_input_data = [(0, protein_sequence)]
#     print('Data: ', esm_input_data, '\n')

#     # Prepare variables to input sequence into ESM-2 model 
#     batch_converter = alphabet.get_batch_converter()
#     batch_labels, batch_strs, batch_tokens = batch_converter(esm_input_data)
#     batch_tokens = batch_tokens.cuda() if torch.cuda.is_available() else batch_tokens
    
#     print('batch_tokens: ', '\n\n', batch_tokens, '\n')

In [16]:
## IF PDB FILES ARE ALREADY LOADED
# Should have 1215 perfect match sequences (same_sequence_ids)
# protein_data = parse_casp7_file(casp)

same_sequence_ids = []
iterations = 0
for pdb_id, protein_sequence in list(protein_data.items()):
    print("Iterations: ", iterations)
    print('Sequence ID: ', pdb_id)
    print('No mismatches in pdb sequence and pulled sequence: ', check_sequences(pdb_id))
    iterations+=1

Iterations:  0
Sequence ID:  1C4R
No mismatches in pdb sequence and pulled sequence:  1C4R
Iterations:  1
Sequence ID:  1UE8
No mismatches in pdb sequence and pulled sequence:  1UE8
Iterations:  2
Sequence ID:  1I24
No mismatches in pdb sequence and pulled sequence:  1I24
Iterations:  3
Sequence ID:  1CPT
No mismatches in pdb sequence and pulled sequence:  1CPT
Iterations:  4
Sequence ID:  1F8E
No mismatches in pdb sequence and pulled sequence:  1F8E
Iterations:  5
Sequence ID:  1CNU
No mismatches in pdb sequence and pulled sequence:  1CNU
Iterations:  6
Sequence ID:  1J5B
No mismatches in pdb sequence and pulled sequence:  1J5B
Iterations:  7
Sequence ID:  1F3M
No mismatches in pdb sequence and pulled sequence:  1F3M
Iterations:  8
Sequence ID:  1P0Z
No mismatches in pdb sequence and pulled sequence:  1P0Z
Iterations:  9
Sequence ID:  1N3K
No mismatches in pdb sequence and pulled sequence:  1N3K
Iterations:  10
Sequence ID:  1XTZ
No mismatches in pdb sequence and pulled sequence:  1XT

No mismatches in pdb sequence and pulled sequence:  1AGG
Iterations:  91
Sequence ID:  1VI9
No mismatches in pdb sequence and pulled sequence:  1VI9
Iterations:  92
Sequence ID:  1H8U
No mismatches in pdb sequence and pulled sequence:  1H8U
Iterations:  93
Sequence ID:  1X8Y
No mismatches in pdb sequence and pulled sequence:  1X8Y
Iterations:  94
Sequence ID:  2B69
No mismatches in pdb sequence and pulled sequence:  2B69
Iterations:  95
Sequence ID:  1WIB
No mismatches in pdb sequence and pulled sequence:  1WIB
Iterations:  96
Sequence ID:  1ANP
No mismatches in pdb sequence and pulled sequence:  1ANP
Iterations:  97
Sequence ID:  1MGR
No mismatches in pdb sequence and pulled sequence:  1MGR
Iterations:  98
Sequence ID:  1APQ
No mismatches in pdb sequence and pulled sequence:  1APQ
Iterations:  99
Sequence ID:  2A8X
No mismatches in pdb sequence and pulled sequence:  2A8X
Iterations:  100
Sequence ID:  1QTX
No mismatches in pdb sequence and pulled sequence:  1QTX
Iterations:  101
Seque

No mismatches in pdb sequence and pulled sequence:  1V60
Iterations:  188
Sequence ID:  1EG2
No mismatches in pdb sequence and pulled sequence:  1EG2
Iterations:  189
Sequence ID:  1TNR
No mismatches in pdb sequence and pulled sequence:  1TNR
Iterations:  190
Sequence ID:  1NDG
No mismatches in pdb sequence and pulled sequence:  1NDG
Iterations:  191
Sequence ID:  2FZ0
No mismatches in pdb sequence and pulled sequence:  2FZ0
Iterations:  192
Sequence ID:  1PGV
No mismatches in pdb sequence and pulled sequence:  1PGV
Iterations:  193
Sequence ID:  2BM5
No mismatches in pdb sequence and pulled sequence:  2BM5
Iterations:  194
Sequence ID:  1TJW
No mismatches in pdb sequence and pulled sequence:  1TJW
Iterations:  195
Sequence ID:  1VR6
No mismatches in pdb sequence and pulled sequence:  1VR6
Iterations:  196
Sequence ID:  2AB9
No mismatches in pdb sequence and pulled sequence:  2AB9
Iterations:  197
Sequence ID:  1SHZ
No mismatches in pdb sequence and pulled sequence:  1SHZ
Iterations:  

No mismatches in pdb sequence and pulled sequence:  1TH7
Iterations:  281
Sequence ID:  1C01
No mismatches in pdb sequence and pulled sequence:  1C01
Iterations:  282
Sequence ID:  1VJR
No mismatches in pdb sequence and pulled sequence:  1VJR
Iterations:  283
Sequence ID:  2B5R
No mismatches in pdb sequence and pulled sequence:  2B5R
Iterations:  284
Sequence ID:  1PML
No mismatches in pdb sequence and pulled sequence:  1PML
Iterations:  285
Sequence ID:  1GY6
No mismatches in pdb sequence and pulled sequence:  1GY6
Iterations:  286
Sequence ID:  1PSM
No mismatches in pdb sequence and pulled sequence:  1PSM
Iterations:  287
Sequence ID:  1NEU
No mismatches in pdb sequence and pulled sequence:  1NEU
Iterations:  288
Sequence ID:  1FAV
No mismatches in pdb sequence and pulled sequence:  1FAV
Iterations:  289
Sequence ID:  1U4Q
No mismatches in pdb sequence and pulled sequence:  1U4Q
Iterations:  290
Sequence ID:  2GHJ
No mismatches in pdb sequence and pulled sequence:  2GHJ
Iterations:  

No mismatches in pdb sequence and pulled sequence:  1ILO
Iterations:  380
Sequence ID:  1IYF
No mismatches in pdb sequence and pulled sequence:  1IYF
Iterations:  381
Sequence ID:  2A5L
No mismatches in pdb sequence and pulled sequence:  2A5L
Iterations:  382
Sequence ID:  1V38
No mismatches in pdb sequence and pulled sequence:  1V38
Iterations:  383
Sequence ID:  1RRE
No mismatches in pdb sequence and pulled sequence:  1RRE
Iterations:  384
Sequence ID:  1ZLB
No mismatches in pdb sequence and pulled sequence:  1ZLB
Iterations:  385
Sequence ID:  1DOS
No mismatches in pdb sequence and pulled sequence:  1DOS
Iterations:  386
Sequence ID:  1OV9
No mismatches in pdb sequence and pulled sequence:  1OV9
Iterations:  387
Sequence ID:  1ZRX
No mismatches in pdb sequence and pulled sequence:  1ZRX
Iterations:  388
Sequence ID:  1WL7
No mismatches in pdb sequence and pulled sequence:  1WL7
Iterations:  389
Sequence ID:  1LKK
No mismatches in pdb sequence and pulled sequence:  1LKK
Iterations:  

No mismatches in pdb sequence and pulled sequence:  1YWU
Iterations:  472
Sequence ID:  1YSP
No mismatches in pdb sequence and pulled sequence:  1YSP
Iterations:  473
Sequence ID:  1F1O
No mismatches in pdb sequence and pulled sequence:  1F1O
Iterations:  474
Sequence ID:  2FGG
No mismatches in pdb sequence and pulled sequence:  2FGG
Iterations:  475
Sequence ID:  2C5R
No mismatches in pdb sequence and pulled sequence:  2C5R
Iterations:  476
Sequence ID:  1V1C
No mismatches in pdb sequence and pulled sequence:  1V1C
Iterations:  477
Sequence ID:  1E69
No mismatches in pdb sequence and pulled sequence:  1E69
Iterations:  478
Sequence ID:  1MDB
No mismatches in pdb sequence and pulled sequence:  1MDB
Iterations:  479
Sequence ID:  1XVX
No mismatches in pdb sequence and pulled sequence:  1XVX
Iterations:  480
Sequence ID:  1UR3
No mismatches in pdb sequence and pulled sequence:  1UR3
Iterations:  481
Sequence ID:  2GFQ
No mismatches in pdb sequence and pulled sequence:  2GFQ
Iterations:  

No mismatches in pdb sequence and pulled sequence:  1X5O
Iterations:  568
Sequence ID:  1SH8
No mismatches in pdb sequence and pulled sequence:  1SH8
Iterations:  569
Sequence ID:  1XQI
No mismatches in pdb sequence and pulled sequence:  1XQI
Iterations:  570
Sequence ID:  1TBO
No mismatches in pdb sequence and pulled sequence:  1TBO
Iterations:  571
Sequence ID:  1CL1
No mismatches in pdb sequence and pulled sequence:  1CL1
Iterations:  572
Sequence ID:  1IIZ
No mismatches in pdb sequence and pulled sequence:  1IIZ
Iterations:  573
Sequence ID:  2CC3
No mismatches in pdb sequence and pulled sequence:  2CC3
Iterations:  574
Sequence ID:  2CY7
No mismatches in pdb sequence and pulled sequence:  2CY7
Iterations:  575
Sequence ID:  1X6B
No mismatches in pdb sequence and pulled sequence:  1X6B
Iterations:  576
Sequence ID:  1GPC
No mismatches in pdb sequence and pulled sequence:  1GPC
Iterations:  577
Sequence ID:  1VQY
No mismatches in pdb sequence and pulled sequence:  1VQY
Iterations:  

No mismatches in pdb sequence and pulled sequence:  1BD0
Iterations:  665
Sequence ID:  1FS5
No mismatches in pdb sequence and pulled sequence:  1FS5
Iterations:  666
Sequence ID:  2FFL
No mismatches in pdb sequence and pulled sequence:  2FFL
Iterations:  667
Sequence ID:  1QHB
No mismatches in pdb sequence and pulled sequence:  1QHB
Iterations:  668
Sequence ID:  1FB9
No mismatches in pdb sequence and pulled sequence:  1FB9
Iterations:  669
Sequence ID:  1EAJ
No mismatches in pdb sequence and pulled sequence:  1EAJ
Iterations:  670
Sequence ID:  1V0H
No mismatches in pdb sequence and pulled sequence:  1V0H
Iterations:  671
Sequence ID:  1RU0
No mismatches in pdb sequence and pulled sequence:  1RU0
Iterations:  672
Sequence ID:  1BCG
No mismatches in pdb sequence and pulled sequence:  1BCG
Iterations:  673
Sequence ID:  2A29
No mismatches in pdb sequence and pulled sequence:  2A29
Iterations:  674
Sequence ID:  1R53
No mismatches in pdb sequence and pulled sequence:  1R53
Iterations:  

No mismatches in pdb sequence and pulled sequence:  1PYV
Iterations:  758
Sequence ID:  1VR7
No mismatches in pdb sequence and pulled sequence:  1VR7
Iterations:  759
Sequence ID:  1A05
No mismatches in pdb sequence and pulled sequence:  1A05
Iterations:  760
Sequence ID:  1IFK
No mismatches in pdb sequence and pulled sequence:  1IFK
Iterations:  761
Sequence ID:  1NV8
No mismatches in pdb sequence and pulled sequence:  1NV8
Iterations:  762
Sequence ID:  2NLL
No mismatches in pdb sequence and pulled sequence:  2NLL
Iterations:  763
Sequence ID:  1YSN
No mismatches in pdb sequence and pulled sequence:  1YSN
Iterations:  764
Sequence ID:  1U60
No mismatches in pdb sequence and pulled sequence:  1U60
Iterations:  765
Sequence ID:  1E29
No mismatches in pdb sequence and pulled sequence:  1E29
Iterations:  766
Sequence ID:  1J02
No mismatches in pdb sequence and pulled sequence:  1J02
Iterations:  767
Sequence ID:  2FQ9
No mismatches in pdb sequence and pulled sequence:  2FQ9
Iterations:  

No mismatches in pdb sequence and pulled sequence:  1X3P
Iterations:  847
Sequence ID:  2F01
No mismatches in pdb sequence and pulled sequence:  2F01
Iterations:  848
Sequence ID:  1OUW
No mismatches in pdb sequence and pulled sequence:  1OUW
Iterations:  849
Sequence ID:  1MAI
No mismatches in pdb sequence and pulled sequence:  1MAI
Iterations:  850
Sequence ID:  1VB0
No mismatches in pdb sequence and pulled sequence:  1VB0
Iterations:  851
Sequence ID:  1H7Y
No mismatches in pdb sequence and pulled sequence:  1H7Y
Iterations:  852
Sequence ID:  1K8U
No mismatches in pdb sequence and pulled sequence:  1K8U
Iterations:  853
Sequence ID:  2CPZ
No mismatches in pdb sequence and pulled sequence:  2CPZ
Iterations:  854
Sequence ID:  1QZ9
No mismatches in pdb sequence and pulled sequence:  1QZ9
Iterations:  855
Sequence ID:  2FJ6
No mismatches in pdb sequence and pulled sequence:  2FJ6
Iterations:  856
Sequence ID:  1A6I
No mismatches in pdb sequence and pulled sequence:  1A6I
Iterations:  

No mismatches in pdb sequence and pulled sequence:  1ZSV
Iterations:  939
Sequence ID:  1E5W
No mismatches in pdb sequence and pulled sequence:  1E5W
Iterations:  940
Sequence ID:  2ATZ
No mismatches in pdb sequence and pulled sequence:  2ATZ
Iterations:  941
Sequence ID:  1XU6
No mismatches in pdb sequence and pulled sequence:  1XU6
Iterations:  942
Sequence ID:  1QFX
No mismatches in pdb sequence and pulled sequence:  1QFX
Iterations:  943
Sequence ID:  1QXY
No mismatches in pdb sequence and pulled sequence:  1QXY
Iterations:  944
Sequence ID:  1EQQ
No mismatches in pdb sequence and pulled sequence:  1EQQ
Iterations:  945
Sequence ID:  1SGO
No mismatches in pdb sequence and pulled sequence:  1SGO
Iterations:  946
Sequence ID:  1WJN
No mismatches in pdb sequence and pulled sequence:  1WJN
Iterations:  947
Sequence ID:  1QIU
No mismatches in pdb sequence and pulled sequence:  1QIU
Iterations:  948
Sequence ID:  1RFY
No mismatches in pdb sequence and pulled sequence:  1RFY
Iterations:  

No mismatches in pdb sequence and pulled sequence:  1WJQ
Iterations:  1034
Sequence ID:  1DM0
No mismatches in pdb sequence and pulled sequence:  1DM0
Iterations:  1035
Sequence ID:  1WPN
No mismatches in pdb sequence and pulled sequence:  1WPN
Iterations:  1036
Sequence ID:  1DS1
No mismatches in pdb sequence and pulled sequence:  1DS1
Iterations:  1037
Sequence ID:  4SBV
No mismatches in pdb sequence and pulled sequence:  4SBV
Iterations:  1038
Sequence ID:  1WF5
No mismatches in pdb sequence and pulled sequence:  1WF5
Iterations:  1039
Sequence ID:  1W0M
No mismatches in pdb sequence and pulled sequence:  1W0M
Iterations:  1040
Sequence ID:  2MLP
No mismatches in pdb sequence and pulled sequence:  2MLP
Iterations:  1041
Sequence ID:  1PLB
No mismatches in pdb sequence and pulled sequence:  1PLB
Iterations:  1042
Sequence ID:  1G4D
No mismatches in pdb sequence and pulled sequence:  1G4D
Iterations:  1043
Sequence ID:  1FQT
No mismatches in pdb sequence and pulled sequence:  1FQT
Ite

No mismatches in pdb sequence and pulled sequence:  1NEI
Iterations:  1125
Sequence ID:  1IYE
No mismatches in pdb sequence and pulled sequence:  1IYE
Iterations:  1126
Sequence ID:  1CIX
No mismatches in pdb sequence and pulled sequence:  1CIX
Iterations:  1127
Sequence ID:  1UPK
No mismatches in pdb sequence and pulled sequence:  1UPK
Iterations:  1128
Sequence ID:  1QGN
No mismatches in pdb sequence and pulled sequence:  1QGN
Iterations:  1129
Sequence ID:  2FBH
No mismatches in pdb sequence and pulled sequence:  2FBH
Iterations:  1130
Sequence ID:  1P3R
No mismatches in pdb sequence and pulled sequence:  1P3R
Iterations:  1131
Sequence ID:  1X7Q
No mismatches in pdb sequence and pulled sequence:  1X7Q
Iterations:  1132
Sequence ID:  1QVB
No mismatches in pdb sequence and pulled sequence:  1QVB
Iterations:  1133
Sequence ID:  1ARK
No mismatches in pdb sequence and pulled sequence:  1ARK
Iterations:  1134
Sequence ID:  1BCF
No mismatches in pdb sequence and pulled sequence:  1BCF
Ite

No mismatches in pdb sequence and pulled sequence:  1JDQ
Iterations:  1213
Sequence ID:  1THM
No mismatches in pdb sequence and pulled sequence:  1THM
Iterations:  1214
Sequence ID:  1XAO
No mismatches in pdb sequence and pulled sequence:  1XAO
Iterations:  1215
Sequence ID:  1GQ6
No mismatches in pdb sequence and pulled sequence:  1GQ6
Iterations:  1216
Sequence ID:  1G9O
No mismatches in pdb sequence and pulled sequence:  1G9O
Iterations:  1217
Sequence ID:  1M3V
No mismatches in pdb sequence and pulled sequence:  1M3V
Iterations:  1218
Sequence ID:  1CJL
No mismatches in pdb sequence and pulled sequence:  1CJL
Iterations:  1219
Sequence ID:  1O13
No mismatches in pdb sequence and pulled sequence:  1O13
Iterations:  1220
Sequence ID:  1MWP
No mismatches in pdb sequence and pulled sequence:  1MWP
Iterations:  1221
Sequence ID:  1R1Q
No mismatches in pdb sequence and pulled sequence:  1R1Q
Iterations:  1222
Sequence ID:  1AZW
No mismatches in pdb sequence and pulled sequence:  1AZW
Ite

No mismatches in pdb sequence and pulled sequence:  1DAV
Iterations:  1309
Sequence ID:  1HLQ
No mismatches in pdb sequence and pulled sequence:  1HLQ
Iterations:  1310
Sequence ID:  1W0T
No mismatches in pdb sequence and pulled sequence:  1W0T
Iterations:  1311
Sequence ID:  1S2X
No mismatches in pdb sequence and pulled sequence:  1S2X
Iterations:  1312
Sequence ID:  1JSG
No mismatches in pdb sequence and pulled sequence:  1JSG
Iterations:  1313
Sequence ID:  1U5F
No mismatches in pdb sequence and pulled sequence:  1U5F
Iterations:  1314
Sequence ID:  1IBJ
No mismatches in pdb sequence and pulled sequence:  1IBJ
Iterations:  1315
Sequence ID:  1AXH
No mismatches in pdb sequence and pulled sequence:  1AXH
Iterations:  1316
Sequence ID:  1I42
No mismatches in pdb sequence and pulled sequence:  1I42
Iterations:  1317
Sequence ID:  1KRQ
No mismatches in pdb sequence and pulled sequence:  1KRQ
Iterations:  1318
Sequence ID:  1YXB
No mismatches in pdb sequence and pulled sequence:  1YXB
Ite

No mismatches in pdb sequence and pulled sequence:  1WFD
Iterations:  1400
Sequence ID:  1S98
No mismatches in pdb sequence and pulled sequence:  1S98
Iterations:  1401
Sequence ID:  1ERV
No mismatches in pdb sequence and pulled sequence:  1ERV
Iterations:  1402
Sequence ID:  2HVM
No mismatches in pdb sequence and pulled sequence:  2HVM
Iterations:  1403
Sequence ID:  1WBA
No mismatches in pdb sequence and pulled sequence:  1WBA
Iterations:  1404
Sequence ID:  2PLC
No mismatches in pdb sequence and pulled sequence:  2PLC
Iterations:  1405
Sequence ID:  1TE7
No mismatches in pdb sequence and pulled sequence:  1TE7
Iterations:  1406
Sequence ID:  2FUP
No mismatches in pdb sequence and pulled sequence:  2FUP
Iterations:  1407
Sequence ID:  1B4F
No mismatches in pdb sequence and pulled sequence:  1B4F
Iterations:  1408
Sequence ID:  1KOQ
No mismatches in pdb sequence and pulled sequence:  1KOQ
Iterations:  1409
Sequence ID:  1GXR
No mismatches in pdb sequence and pulled sequence:  1GXR
Ite

No mismatches in pdb sequence and pulled sequence:  2BYC
Iterations:  1492
Sequence ID:  2FJC
No mismatches in pdb sequence and pulled sequence:  2FJC
Iterations:  1493
Sequence ID:  2EW1
No mismatches in pdb sequence and pulled sequence:  2EW1
Iterations:  1494
Sequence ID:  1SJW
No mismatches in pdb sequence and pulled sequence:  1SJW
Iterations:  1495
Sequence ID:  1L6R
No mismatches in pdb sequence and pulled sequence:  1L6R
Iterations:  1496
Sequence ID:  1X4R
No mismatches in pdb sequence and pulled sequence:  1X4R
Iterations:  1497
Sequence ID:  1F7C
No mismatches in pdb sequence and pulled sequence:  1F7C
Iterations:  1498
Sequence ID:  1YQB
No mismatches in pdb sequence and pulled sequence:  1YQB
Iterations:  1499
Sequence ID:  1U09
No mismatches in pdb sequence and pulled sequence:  1U09
Iterations:  1500
Sequence ID:  1YOC
No mismatches in pdb sequence and pulled sequence:  1YOC
Iterations:  1501
Sequence ID:  1ORU
No mismatches in pdb sequence and pulled sequence:  1ORU
Ite

No mismatches in pdb sequence and pulled sequence:  1UAP
Iterations:  1583
Sequence ID:  1SFN
No mismatches in pdb sequence and pulled sequence:  1SFN
Iterations:  1584
Sequence ID:  1HE7
No mismatches in pdb sequence and pulled sequence:  1HE7
Iterations:  1585
Sequence ID:  1H7J
No mismatches in pdb sequence and pulled sequence:  1H7J
Iterations:  1586
Sequence ID:  1WKO
No mismatches in pdb sequence and pulled sequence:  1WKO
Iterations:  1587
Sequence ID:  1PSF
No mismatches in pdb sequence and pulled sequence:  1PSF
Iterations:  1588
Sequence ID:  1FTT
No mismatches in pdb sequence and pulled sequence:  1FTT
Iterations:  1589
Sequence ID:  1XWE
No mismatches in pdb sequence and pulled sequence:  1XWE
Iterations:  1590
Sequence ID:  1NNV
No mismatches in pdb sequence and pulled sequence:  1NNV
Iterations:  1591
Sequence ID:  1SN4
No mismatches in pdb sequence and pulled sequence:  1SN4
Iterations:  1592
Sequence ID:  1VTX
No mismatches in pdb sequence and pulled sequence:  1VTX
Ite

No mismatches in pdb sequence and pulled sequence:  1QWG
Iterations:  1682
Sequence ID:  1AA0
No mismatches in pdb sequence and pulled sequence:  1AA0
Iterations:  1683
Sequence ID:  2AEE
No mismatches in pdb sequence and pulled sequence:  2AEE
Iterations:  1684
Sequence ID:  1Z41
No mismatches in pdb sequence and pulled sequence:  1Z41
Iterations:  1685
Sequence ID:  1IA6
No mismatches in pdb sequence and pulled sequence:  1IA6
Iterations:  1686
Sequence ID:  5XIN
No mismatches in pdb sequence and pulled sequence:  5XIN
Iterations:  1687
Sequence ID:  1JBE
No mismatches in pdb sequence and pulled sequence:  1JBE
Iterations:  1688
Sequence ID:  2COZ
No mismatches in pdb sequence and pulled sequence:  2COZ
Iterations:  1689
Sequence ID:  1NQE
No mismatches in pdb sequence and pulled sequence:  1NQE
Iterations:  1690
Sequence ID:  1XVW
No mismatches in pdb sequence and pulled sequence:  1XVW
Iterations:  1691
Sequence ID:  1XRI
No mismatches in pdb sequence and pulled sequence:  1XRI
Ite

No mismatches in pdb sequence and pulled sequence:  1MZH
Iterations:  1770
Sequence ID:  2F51
No mismatches in pdb sequence and pulled sequence:  2F51
Iterations:  1771
Sequence ID:  1OTR
No mismatches in pdb sequence and pulled sequence:  1OTR
Iterations:  1772
Sequence ID:  1FC9
No mismatches in pdb sequence and pulled sequence:  1FC9
Iterations:  1773
Sequence ID:  1UDC
No mismatches in pdb sequence and pulled sequence:  1UDC
Iterations:  1774
Sequence ID:  1ZBM
No mismatches in pdb sequence and pulled sequence:  1ZBM
Iterations:  1775
Sequence ID:  2FU2
No mismatches in pdb sequence and pulled sequence:  2FU2
Iterations:  1776
Sequence ID:  1TJX
No mismatches in pdb sequence and pulled sequence:  1TJX
Iterations:  1777
Sequence ID:  2F2F
No mismatches in pdb sequence and pulled sequence:  2F2F
Iterations:  1778
Sequence ID:  1U5L
No mismatches in pdb sequence and pulled sequence:  1U5L
Iterations:  1779
Sequence ID:  1HCV
No mismatches in pdb sequence and pulled sequence:  1HCV
Ite

No mismatches in pdb sequence and pulled sequence:  1XN7
Iterations:  1863
Sequence ID:  1PE9
No mismatches in pdb sequence and pulled sequence:  1PE9
Iterations:  1864
Sequence ID:  1CFF
No mismatches in pdb sequence and pulled sequence:  1CFF
Iterations:  1865
Sequence ID:  1B9R
No mismatches in pdb sequence and pulled sequence:  1B9R
Iterations:  1866
Sequence ID:  1V0Z
No mismatches in pdb sequence and pulled sequence:  1V0Z
Iterations:  1867
Sequence ID:  1Y2Q
No mismatches in pdb sequence and pulled sequence:  1Y2Q
Iterations:  1868
Sequence ID:  1NTV
No mismatches in pdb sequence and pulled sequence:  1NTV
Iterations:  1869
Sequence ID:  1ZGZ
No mismatches in pdb sequence and pulled sequence:  1ZGZ
Iterations:  1870
Sequence ID:  2BH4
No mismatches in pdb sequence and pulled sequence:  2BH4
Iterations:  1871
Sequence ID:  1BHD
No mismatches in pdb sequence and pulled sequence:  1BHD
Iterations:  1872
Sequence ID:  2BV5
No mismatches in pdb sequence and pulled sequence:  2BV5
Ite

No mismatches in pdb sequence and pulled sequence:  2FNB
Iterations:  1951
Sequence ID:  1Y62
No mismatches in pdb sequence and pulled sequence:  1Y62
Iterations:  1952
Sequence ID:  1WYS
No mismatches in pdb sequence and pulled sequence:  1WYS
Iterations:  1953
Sequence ID:  2ES0
No mismatches in pdb sequence and pulled sequence:  2ES0
Iterations:  1954
Sequence ID:  1WGR
No mismatches in pdb sequence and pulled sequence:  1WGR
Iterations:  1955
Sequence ID:  1KCN
No mismatches in pdb sequence and pulled sequence:  1KCN
Iterations:  1956
Sequence ID:  1OTK
No mismatches in pdb sequence and pulled sequence:  1OTK
Iterations:  1957
Sequence ID:  1TM6
No mismatches in pdb sequence and pulled sequence:  1TM6
Iterations:  1958
Sequence ID:  1X2L
No mismatches in pdb sequence and pulled sequence:  1X2L
Iterations:  1959
Sequence ID:  1OJJ
No mismatches in pdb sequence and pulled sequence:  1OJJ
Iterations:  1960
Sequence ID:  1ISU
No mismatches in pdb sequence and pulled sequence:  1ISU
Ite

No mismatches in pdb sequence and pulled sequence:  2BL6
Iterations:  2039
Sequence ID:  1C7W
No mismatches in pdb sequence and pulled sequence:  1C7W
Iterations:  2040
Sequence ID:  1RLJ
No mismatches in pdb sequence and pulled sequence:  1RLJ
Iterations:  2041
Sequence ID:  2SAS
No mismatches in pdb sequence and pulled sequence:  2SAS
Iterations:  2042
Sequence ID:  1AO5
No mismatches in pdb sequence and pulled sequence:  1AO5
Iterations:  2043
Sequence ID:  1EM9
No mismatches in pdb sequence and pulled sequence:  1EM9
Iterations:  2044
Sequence ID:  1Q2I
No mismatches in pdb sequence and pulled sequence:  1Q2I
Iterations:  2045
Sequence ID:  1UW1
No mismatches in pdb sequence and pulled sequence:  1UW1
Iterations:  2046
Sequence ID:  1O1X
No mismatches in pdb sequence and pulled sequence:  1O1X
Iterations:  2047
Sequence ID:  1NPM
No mismatches in pdb sequence and pulled sequence:  1NPM
Iterations:  2048
Sequence ID:  1UI0
No mismatches in pdb sequence and pulled sequence:  1UI0
Ite

No mismatches in pdb sequence and pulled sequence:  2CH7
Iterations:  2130
Sequence ID:  1RSY
No mismatches in pdb sequence and pulled sequence:  1RSY
Iterations:  2131
Sequence ID:  1S2L
No mismatches in pdb sequence and pulled sequence:  1S2L
Iterations:  2132
Sequence ID:  1U5R
No mismatches in pdb sequence and pulled sequence:  1U5R
Iterations:  2133
Sequence ID:  1FZQ
No mismatches in pdb sequence and pulled sequence:  1FZQ
Iterations:  2134
Sequence ID:  1YJ3
No mismatches in pdb sequence and pulled sequence:  1YJ3
Iterations:  2135
Sequence ID:  2CZR
No mismatches in pdb sequence and pulled sequence:  2CZR
Iterations:  2136
Sequence ID:  2MYO
No mismatches in pdb sequence and pulled sequence:  2MYO
Iterations:  2137
Sequence ID:  1T0Z
No mismatches in pdb sequence and pulled sequence:  1T0Z
Iterations:  2138
Sequence ID:  1EUF
No mismatches in pdb sequence and pulled sequence:  1EUF
Iterations:  2139
Sequence ID:  1Z3G
No mismatches in pdb sequence and pulled sequence:  1Z3G
Ite

No mismatches in pdb sequence and pulled sequence:  2D4E
Iterations:  2222
Sequence ID:  1VYR
No mismatches in pdb sequence and pulled sequence:  1VYR
Iterations:  2223
Sequence ID:  1OP4
No mismatches in pdb sequence and pulled sequence:  1OP4
Iterations:  2224
Sequence ID:  1EHS
No mismatches in pdb sequence and pulled sequence:  1EHS
Iterations:  2225
Sequence ID:  1Z0S
No mismatches in pdb sequence and pulled sequence:  1Z0S
Iterations:  2226
Sequence ID:  1SZ7
No mismatches in pdb sequence and pulled sequence:  1SZ7
Iterations:  2227
Sequence ID:  1ABZ
No mismatches in pdb sequence and pulled sequence:  1ABZ
Iterations:  2228
Sequence ID:  1W30
No mismatches in pdb sequence and pulled sequence:  1W30
Iterations:  2229
Sequence ID:  1T0C
No mismatches in pdb sequence and pulled sequence:  1T0C
Iterations:  2230
Sequence ID:  1KLF
No mismatches in pdb sequence and pulled sequence:  1KLF
Iterations:  2231
Sequence ID:  1P9G
No mismatches in pdb sequence and pulled sequence:  1P9G
Ite

No mismatches in pdb sequence and pulled sequence:  1Y2M
Iterations:  2315
Sequence ID:  1NJK
No mismatches in pdb sequence and pulled sequence:  1NJK
Iterations:  2316
Sequence ID:  1C3P
No mismatches in pdb sequence and pulled sequence:  1C3P
Iterations:  2317
Sequence ID:  1XFQ
No mismatches in pdb sequence and pulled sequence:  1XFQ
Iterations:  2318
Sequence ID:  1HBX
No mismatches in pdb sequence and pulled sequence:  1HBX
Iterations:  2319
Sequence ID:  2AQF
No mismatches in pdb sequence and pulled sequence:  2AQF
Iterations:  2320
Sequence ID:  1VEL
No mismatches in pdb sequence and pulled sequence:  1VEL
Iterations:  2321
Sequence ID:  1CMR
No mismatches in pdb sequence and pulled sequence:  1CMR
Iterations:  2322
Sequence ID:  1KMT
No mismatches in pdb sequence and pulled sequence:  1KMT
Iterations:  2323
Sequence ID:  1HJ9
No mismatches in pdb sequence and pulled sequence:  1HJ9
Iterations:  2324
Sequence ID:  1G25
No mismatches in pdb sequence and pulled sequence:  1G25
Ite

No mismatches in pdb sequence and pulled sequence:  2BUF
Iterations:  2410
Sequence ID:  1YLK
No mismatches in pdb sequence and pulled sequence:  1YLK
Iterations:  2411
Sequence ID:  1VJE
No mismatches in pdb sequence and pulled sequence:  1VJE
Iterations:  2412
Sequence ID:  1NTN
No mismatches in pdb sequence and pulled sequence:  1NTN
Iterations:  2413
Sequence ID:  1J16
No mismatches in pdb sequence and pulled sequence:  1J16
Iterations:  2414
Sequence ID:  1UW7
No mismatches in pdb sequence and pulled sequence:  1UW7
Iterations:  2415
Sequence ID:  1FTH
No mismatches in pdb sequence and pulled sequence:  1FTH
Iterations:  2416
Sequence ID:  1N82
No mismatches in pdb sequence and pulled sequence:  1N82
Iterations:  2417
Sequence ID:  1E6U
No mismatches in pdb sequence and pulled sequence:  1E6U
Iterations:  2418
Sequence ID:  1LMB
No mismatches in pdb sequence and pulled sequence:  1LMB
Iterations:  2419
Sequence ID:  1S1G
No mismatches in pdb sequence and pulled sequence:  1S1G
Ite

No mismatches in pdb sequence and pulled sequence:  1UZC
Iterations:  2499
Sequence ID:  1R42
No mismatches in pdb sequence and pulled sequence:  1R42
Iterations:  2500
Sequence ID:  1MR3
No mismatches in pdb sequence and pulled sequence:  1MR3
Iterations:  2501
Sequence ID:  1G0S
No mismatches in pdb sequence and pulled sequence:  1G0S
Iterations:  2502
Sequence ID:  1TET
No mismatches in pdb sequence and pulled sequence:  1TET
Iterations:  2503
Sequence ID:  2BX2
No mismatches in pdb sequence and pulled sequence:  2BX2
Iterations:  2504
Sequence ID:  1NOA
No mismatches in pdb sequence and pulled sequence:  1NOA
Iterations:  2505
Sequence ID:  1NUE
No mismatches in pdb sequence and pulled sequence:  1NUE
Iterations:  2506
Sequence ID:  1JMM
No mismatches in pdb sequence and pulled sequence:  1JMM
Iterations:  2507
Sequence ID:  1BWO
No mismatches in pdb sequence and pulled sequence:  1BWO
Iterations:  2508
Sequence ID:  1DPQ
No mismatches in pdb sequence and pulled sequence:  1DPQ
Ite

No mismatches in pdb sequence and pulled sequence:  1PFJ
Iterations:  2590
Sequence ID:  1XBI
No mismatches in pdb sequence and pulled sequence:  1XBI
Iterations:  2591
Sequence ID:  1TUJ
No mismatches in pdb sequence and pulled sequence:  1TUJ
Iterations:  2592
Sequence ID:  1XFL
No mismatches in pdb sequence and pulled sequence:  1XFL
Iterations:  2593
Sequence ID:  1SWG
No mismatches in pdb sequence and pulled sequence:  1SWG
Iterations:  2594
Sequence ID:  1WDG
No mismatches in pdb sequence and pulled sequence:  1WDG
Iterations:  2595
Sequence ID:  1Z2Q
No mismatches in pdb sequence and pulled sequence:  1Z2Q
Iterations:  2596
Sequence ID:  2G2C
No mismatches in pdb sequence and pulled sequence:  2G2C
Iterations:  2597
Sequence ID:  1KIL
No mismatches in pdb sequence and pulled sequence:  1KIL
Iterations:  2598
Sequence ID:  1O60
No mismatches in pdb sequence and pulled sequence:  1O60
Iterations:  2599
Sequence ID:  1KMI
No mismatches in pdb sequence and pulled sequence:  1KMI
Ite

No mismatches in pdb sequence and pulled sequence:  1UST
Iterations:  2679
Sequence ID:  1PI8
No mismatches in pdb sequence and pulled sequence:  1PI8
Iterations:  2680
Sequence ID:  1J5U
No mismatches in pdb sequence and pulled sequence:  1J5U
Iterations:  2681
Sequence ID:  1YZV
No mismatches in pdb sequence and pulled sequence:  1YZV


## Calculating Expected Contact Sites from PDB structure file

In [None]:
# parser = PDBParser()
# structure = parser.get_structure(pdb_id, f"{structure_dir+pdb_id}.pdb")

# # Extract desired protein structure from PDB structure (typically only 1 structure to choose from)
# protein_structure = structure[0]


# for chain in protein_structure:
#     print("Chain:", chain.id)  # Reset position for each chain

#     for residue in chain:
# #         print(residue.id)
#         if residue_position >= len(protein_data[pdb_id]):
#             print("Sequence length exceeded at chain", chain.id)
#             break  # Break the loop if the position exceeds the sequence length
#         try:
#             if simple_aa(residue.resname) != protein_data[pdb_id][residue_position]:
#                 print("Mismatch at position:", residue.id[1])
#                 break
#             else:
#                 if simple_aa(residue.resname) == None:
#                     continue
#                 else:
#                     print(residue.id[1], simple_aa(residue.resname), protein_data[pdb_id][residue_position], residue['CA'])
#                 residue_position += 1
            
#         except KeyError:
#             # Specific exception handling
#             continue
#     print(residue_position)

In [17]:
def calc_contact_sites(pdb_id):
    structure = parser.get_structure(pdb_id, f"{structure_dir}/{pdb_id}.pdb")  # Ensure correct path joining
    protein_structure = structure[0]
    chain = protein_structure['A']

    # Initialize count variable
    count = 0

    for i, residue1 in enumerate(chain):
        for j, residue2 in enumerate(chain):
            if i <= j:
                continue # Avoids redundant comparisons and self-comparison
            if residue1.id[1] > len(protein_data[pdb_id]) or residue2.id[1] > len(protein_data[pdb_id]):
                continue
            try:
                distance = abs(residue1['CA'] - residue2['CA'])
            except KeyError:
                continue
            aa_distance = abs(residue1.id[1] - residue2.id[1])
            if distance < 5:
                if aa_distance > 2:
    #                 print(residue1.id[1], residue1.resname, residue2.id[1], residue2.resname, distance)
                    in_contact_sites[pdb_id].append({
                        'res_1': residue1.id[1], 
                        'res_2': residue2.id[1], 
                        'sig_1': simple_aa(residue1.resname), 
                        'sig_2': simple_aa(residue2.resname),
                        'aa_dist': aa_distance,
                        'arn_dist': distance,
                        'in_contact': True
                    })
                    count += 1
            else:
                if aa_distance > 2:
                    non_contact_sites[pdb_id].append({
                        'res_1': residue1.id[1], 
                        'res_2': residue2.id[1], 
                        'sig_1': simple_aa(residue1.resname), 
                        'sig_2': simple_aa(residue2.resname),
                        'aa_dist': aa_distance,
                        'arn_dist': distance,
                        'in_contact': False
                    })

    if non_contact_sites[pdb_id]:
        subset_non_contact_sites[pdb_id] = random.sample(non_contact_sites[pdb_id], min(len(non_contact_sites[pdb_id]), len(in_contact_sites[pdb_id])))

    # Optionally print or process the results
    return f"Total contacts found {pdb_id}: {count}"

In [18]:
in_contact_sites = defaultdict(list)
non_contact_sites = defaultdict(list)
subset_non_contact_sites = defaultdict(list)

iterations = 0

for pdb_id in same_sequence_ids:
    print(calc_contact_sites(pdb_id))
    print(len(in_contact_sites[pdb_id]), len(subset_non_contact_sites[pdb_id]))
    print("Iterations: ", iterations)
    iterations+=1

Total contacts found 1UE8: 88
88 88
Iterations:  0
Total contacts found 1F8E: 103
103 103
Iterations:  1
Total contacts found 1J5B: 17
17 17
Iterations:  2
Total contacts found 1P0Z: 32
32 32
Iterations:  3
Total contacts found 1N3K: 44
44 44
Iterations:  4
Total contacts found 1JDM: 8
8 8
Iterations:  5
Total contacts found 1NSH: 24
24 24
Iterations:  6
Total contacts found 1Y5O: 33
33 33
Iterations:  7
Total contacts found 2A55: 24
24 24
Iterations:  8
Total contacts found 1RDG: 13
13 13
Iterations:  9
Total contacts found 1XNE: 35
35 35
Iterations:  10
Total contacts found 1AOY: 14
14 14
Iterations:  11
Total contacts found 1XW3: 11
11 11
Iterations:  12
Total contacts found 1G12: 49
49 49
Iterations:  13
Total contacts found 2BCW: 13
13 13
Iterations:  14
Total contacts found 1CTJ: 23
23 23
Iterations:  15
Total contacts found 1R5S: 12
12 12
Iterations:  16
Total contacts found 1MWW: 29
29 29
Iterations:  17
Total contacts found 1SNB: 25
25 25
Iterations:  18
Total contacts found 1

Total contacts found 2PFK: 94
94 94
Iterations:  160
Total contacts found 1SS3: 9
9 9
Iterations:  161
Total contacts found 1JZU: 71
71 71
Iterations:  162
Total contacts found 1NPC: 109
109 109
Iterations:  163
Total contacts found 1K8R: 44
44 44
Iterations:  164
Total contacts found 1EW3: 39
39 39
Iterations:  165
Total contacts found 1VB7: 36
36 36
Iterations:  166
Total contacts found 1ILO: 28
28 28
Iterations:  167
Total contacts found 1V38: 18
18 18
Iterations:  168
Total contacts found 1ZLB: 34
34 34
Iterations:  169
Total contacts found 1DOS: 99
99 99
Iterations:  170
Total contacts found 1ZRX: 14
14 14
Iterations:  171
Total contacts found 1WL7: 95
95 95
Iterations:  172
Total contacts found 1LKK: 0
0 0
Iterations:  173
Total contacts found 1WH2: 18
18 18
Iterations:  174
Total contacts found 2CW1: 21
21 21
Iterations:  175
Total contacts found 1X8D: 27
27 27
Iterations:  176
Total contacts found 2CS4: 20
20 20
Iterations:  177
Total contacts found 1UEZ: 33
33 33
Iterations:  

Total contacts found 1X1M: 19
19 19
Iterations:  321
Total contacts found 1XC0: 14
14 14
Iterations:  322
Total contacts found 1PC6: 25
25 25
Iterations:  323
Total contacts found 1I3Z: 28
28 28
Iterations:  324
Total contacts found 2COK: 46
46 46
Iterations:  325
Total contacts found 1V5X: 56
56 56
Iterations:  326
Total contacts found 2GJ0: 11
11 11
Iterations:  327
Total contacts found 1XZ9: 48
48 48
Iterations:  328
Total contacts found 1CCD: 15
15 15
Iterations:  329
Total contacts found 1G70: 0
0 0
Iterations:  330
Total contacts found 1XK5: 38
38 38
Iterations:  331
Total contacts found 1AJ2: 60
60 60
Iterations:  332
Total contacts found 1V61: 31
31 31
Iterations:  333
Total contacts found 1A05: 130
130 130
Iterations:  334
Total contacts found 1IFK: 15
15 15
Iterations:  335
Total contacts found 1YSN: 27
27 27
Iterations:  336
Total contacts found 1E29: 37
37 37
Iterations:  337
Total contacts found 2FQ9: 51
51 51
Iterations:  338
Total contacts found 1ZXG: 24
24 24
Iterations

Total contacts found 1WJ5: 27
27 27
Iterations:  478
Total contacts found 1LWM: 23
23 23
Iterations:  479
Total contacts found 1O08: 0
0 0
Iterations:  480
Total contacts found 1J0P: 25
25 25
Iterations:  481
Total contacts found 1V6S: 117
117 117
Iterations:  482
Total contacts found 1ANU: 48
48 48
Iterations:  483
Total contacts found 2BIS: 137
137 137
Iterations:  484
Total contacts found 2B3T: 71
71 71
Iterations:  485
Total contacts found 2CRA: 7
7 7
Iterations:  486
Total contacts found 1FHF: 47
47 47
Iterations:  487
Total contacts found 1EQ1: 49
49 49
Iterations:  488
Total contacts found 1GMU: 45
45 45
Iterations:  489
Total contacts found 1QTF: 80
80 80
Iterations:  490
Total contacts found 1WXP: 37
37 37
Iterations:  491
Total contacts found 1NQ6: 91
91 91
Iterations:  492
Total contacts found 1BJJ: 38
38 38
Iterations:  493
Total contacts found 1RYU: 30
30 30
Iterations:  494
Total contacts found 1XWI: 43
43 43
Iterations:  495
Total contacts found 1KT6: 42
42 42
Iterations

Total contacts found 1FLM: 32
32 32
Iterations:  639
Total contacts found 1WMW: 77
77 77
Iterations:  640
Total contacts found 1V6F: 41
41 41
Iterations:  641
Total contacts found 1IPG: 35
35 35
Iterations:  642
Total contacts found 1U55: 49
49 49
Iterations:  643
Total contacts found 1KZX: 45
45 45
Iterations:  644
Total contacts found 1HJ8: 70
70 70
Iterations:  645
Total contacts found 1XT5: 42
42 42
Iterations:  646
Total contacts found 1LQB: 32
32 32
Iterations:  647
Total contacts found 1RVV: 51
51 51
Iterations:  648
Total contacts found 2SAK: 24
24 24
Iterations:  649
Total contacts found 1NJQ: 4
4 4
Iterations:  650
Total contacts found 1SN6: 23
23 23
Iterations:  651
Total contacts found 1SP7: 1
1 1
Iterations:  652
Total contacts found 2C0H: 90
90 90
Iterations:  653
Total contacts found 1QHV: 0
0 0
Iterations:  654
Total contacts found 1YHP: 65
65 65
Iterations:  655
Total contacts found 1GKA: 50
50 50
Iterations:  656
Total contacts found 1S04: 44
44 44
Iterations:  657
To

Total contacts found 2COE: 36
36 36
Iterations:  796
Total contacts found 1X2N: 11
11 11
Iterations:  797
Total contacts found 1MZH: 31
31 31
Iterations:  798
Total contacts found 1UDC: 102
102 102
Iterations:  799
Total contacts found 1U5L: 0
0 0
Iterations:  800
Total contacts found 1VDB: 3
3 3
Iterations:  801
Total contacts found 1S6W: 3
3 3
Iterations:  802
Total contacts found 2B38: 8
8 8
Iterations:  803
Total contacts found 1QXN: 37
37 37
Iterations:  804
Total contacts found 1RA9: 37
37 37
Iterations:  805
Total contacts found 1IQS: 20
20 20
Iterations:  806
Total contacts found 1T12: 36
36 36
Iterations:  807
Total contacts found 1TYK: 8
8 8
Iterations:  808
Total contacts found 1PPO: 73
73 73
Iterations:  809
Total contacts found 256B: 16
16 16
Iterations:  810
Total contacts found 1DGF: 118
118 118
Iterations:  811
Total contacts found 1BKP: 65
65 65
Iterations:  812
Total contacts found 1WYJ: 34
34 34
Iterations:  813
Total contacts found 1EMX: 11
11 11
Iterations:  814
To

Total contacts found 1T17: 48
48 48
Iterations:  953
Total contacts found 1GD0: 31
31 31
Iterations:  954
Total contacts found 1AC5: 142
142 142
Iterations:  955
Total contacts found 1UP9: 21
21 21
Iterations:  956
Total contacts found 1PPQ: 0
0 0
Iterations:  957
Total contacts found 1MBM: 72
72 72
Iterations:  958
Total contacts found 1VA4: 84
84 84
Iterations:  959
Total contacts found 1A92: 7
7 7
Iterations:  960
Total contacts found 1AUN: 66
66 66
Iterations:  961
Total contacts found 1WHX: 0
0 0
Iterations:  962
Total contacts found 1V7L: 66
66 66
Iterations:  963
Total contacts found 1VE1: 94
94 94
Iterations:  964
Total contacts found 1UFB: 36
36 36
Iterations:  965
Total contacts found 1X4Q: 28
28 28
Iterations:  966
Total contacts found 1NBL: 21
21 21
Iterations:  967
Total contacts found 1F07: 110
110 110
Iterations:  968
Total contacts found 1RKB: 32
32 32
Iterations:  969
Total contacts found 1S4V: 69
69 69
Iterations:  970
Total contacts found 1Z99: 13
13 13
Iterations:  

Total contacts found 1X5E: 28
28 28
Iterations:  1110
Total contacts found 1UGV: 23
23 23
Iterations:  1111
Total contacts found 1EGZ: 78
78 78
Iterations:  1112
Total contacts found 2A07: 0
0 0
Iterations:  1113
Total contacts found 1J32: 105
105 105
Iterations:  1114
Total contacts found 2CRQ: 21
21 21
Iterations:  1115
Total contacts found 1Y0M: 0
0 0
Iterations:  1116
Total contacts found 1JM0: 10
10 10
Iterations:  1117
Total contacts found 1F2J: 108
108 108
Iterations:  1118
Total contacts found 1WFZ: 34
34 34
Iterations:  1119
Total contacts found 2HRV: 45
45 45
Iterations:  1120
Total contacts found 1YW9: 61
61 61
Iterations:  1121
Total contacts found 1K1V: 3
3 3
Iterations:  1122
Total contacts found 2AIB: 14
14 14
Iterations:  1123
Total contacts found 1UG2: 20
20 20
Iterations:  1124
Total contacts found 1BWM: 55
55 55
Iterations:  1125
Total contacts found 1MG8: 34
34 34
Iterations:  1126
Total contacts found 2MCM: 38
38 38
Iterations:  1127
Total contacts found 1MV4: 0
0 

In [19]:
len(in_contact_sites)

1215

## Initialize Contact Dictionaries for Training Random Forest Classifier

In [None]:
# from collections import defaultdict

# # Assuming protein_structure and protein_sequence are defined

# contacts = defaultdict(dict)

# # Mapping of chain IDs to starting index
# chain_to_index = {'A': -1, 'B': 275, 'P': 375}

# count = 0


# ### Contacts from the SAME chain are only being considered

# for chain in protein_structure:
# #     print(chain.id)
#     index = chain_to_index.get(chain.id, 0)  # Default to 0 if chain ID not found

#     for residue1 in chain:
#         for residue2 in chain:
#             if residue1 != residue2:
#                 try:
#                     # Calculate Alpha-Carbon Distance
#                     distance = abs(residue1['CA'] - residue2['CA'])
#                 except KeyError:
#                     continue

#                 # Calculating Distance of Amino Acids in Sequence
#                 diff = abs(residue1.id[1] - residue2.id[1])
#                 if diff > 2:
#                     res1_index = residue1.id[1] + index
#                     res2_index = residue2.id[1] + index

#                     # Ensure indices are within the bounds of protein_sequence
#                     if 0 <= res1_index < len(protein_sequence) and 0 <= res2_index < len(protein_sequence):
#                         contact = distance < 5
#                         contacts[(res1_index, res2_index)] = {
#                             'aa1': protein_sequence[res1_index],
#                             'aa2': protein_sequence[res2_index], 
#                             'dist': distance,
#                             'contact': contact
#                         }
#                         if contact:
#                             count += 1
# #                         print(res1_index, residue1.resname, res2_index, residue2.resname, distance, protein_sequence[res1_index], protein_sequence[res2_index], contact)

# print(f"Total contacts: {count}")


In [None]:
                        'aa_dist': aa_distance,
                        'arn_dist': distance,

In [21]:
# Initialize contact_data as a defaultdict of lists
contact_data = defaultdict(list)

# Add data from in_contact_sites
for pdb_id, contacts in in_contact_sites.items():
    for contact in contacts:
        contact_data[pdb_id].append({
            'res_1': contact['res_1'],
            'res_2': contact['res_2'],
            'sig_1': contact['sig_1'],
            'sig_2': contact['sig_2'],
            'aa_dist': contact['aa_dist'],
            'arn_dist': contact['arn_dist'],
            'in_contact': contact['in_contact']
        })

# Add data from subset_non_contact_sites
for pdb_id, non_contacts in subset_non_contact_sites.items():
    for non_contact in non_contacts:
        contact_data[pdb_id].append({
            'res_1': non_contact['res_1'],
            'res_2': non_contact['res_2'],
            'sig_1': non_contact['sig_1'],
            'sig_2': non_contact['sig_2'],
            'aa_dist': contact['aa_dist'],
            'arn_dist': contact['arn_dist'],
            'in_contact': non_contact['in_contact']
        })

# contact_data is now a defaultdict containing all the data from both dictionaries


In [30]:
for k,v in list(contact_data.items())[:4]:
    print(k)
    for i in v:
        if i['aa_dist'] > 12:
            print(i)

1UE8
{'res_1': 70, 'res_2': 41, 'sig_1': 'T', 'sig_2': 'S', 'aa_dist': 29, 'arn_dist': 4.8687644, 'in_contact': True}
{'res_1': 71, 'res_2': 42, 'sig_1': 'S', 'sig_2': 'N', 'aa_dist': 29, 'arn_dist': 4.955557, 'in_contact': True}
{'res_1': 185, 'res_2': 132, 'sig_1': 'D', 'sig_2': 'G', 'aa_dist': 53, 'arn_dist': 4.117061, 'in_contact': True}
{'res_1': 186, 'res_2': 131, 'sig_1': 'L', 'sig_2': 'L', 'aa_dist': 55, 'arn_dist': 4.741112, 'in_contact': True}
{'res_1': 202, 'res_2': 63, 'sig_1': 'G', 'sig_2': 'P', 'aa_dist': 139, 'arn_dist': 4.392543, 'in_contact': True}
{'res_1': 202, 'res_2': 64, 'sig_1': 'G', 'sig_2': 'T', 'aa_dist': 138, 'arn_dist': 4.7936673, 'in_contact': True}
{'res_1': 212, 'res_2': 123, 'sig_1': 'N', 'sig_2': 'P', 'aa_dist': 89, 'arn_dist': 4.497166, 'in_contact': True}
{'res_1': 216, 'res_2': 119, 'sig_1': 'T', 'sig_2': 'A', 'aa_dist': 97, 'arn_dist': 4.7375617, 'in_contact': True}
{'res_1': 261, 'res_2': 39, 'sig_1': 'T', 'sig_2': 'F', 'aa_dist': 222, 'arn_dist': 

In [None]:

# # Assuming contact_data is defined and is a list of dictionaries
# total_data = same_sequence_ids

# # First split: Splitting into 80% for training and 20% for the remaining
# train_data, remaining_data = train_test_split(total_data, test_size=0.20, random_state=42)

# # Second split: Splitting the remaining 20% into 15% test and 5% validation of the original data
# # This corresponds to 75% (test) and 25% (validation) of the remaining 20% data
# test_data, validation_data = train_test_split(remaining_data, test_size=0.25, random_state=42)

# # Print the sizes to verify
# print(f"Training set size: {len(train_data)}")
# print(f"Test set size: {len(test_data)}")
# print(f"Validation set size: {len(validation_data)}")


## Generate ESM-2 Embedding

In [33]:
# 1. Load the ESM Model
model, alphabet = esm.pretrained.esm2_t33_650M_UR50D()
model.eval()

ESM2(
  (embed_tokens): Embedding(33, 1280, padding_idx=1)
  (layers): ModuleList(
    (0-32): 33 x TransformerLayer(
      (self_attn): MultiheadAttention(
        (k_proj): Linear(in_features=1280, out_features=1280, bias=True)
        (v_proj): Linear(in_features=1280, out_features=1280, bias=True)
        (q_proj): Linear(in_features=1280, out_features=1280, bias=True)
        (out_proj): Linear(in_features=1280, out_features=1280, bias=True)
        (rot_emb): RotaryEmbedding()
      )
      (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
      (fc1): Linear(in_features=1280, out_features=5120, bias=True)
      (fc2): Linear(in_features=5120, out_features=1280, bias=True)
      (final_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
    )
  )
  (contact_head): ContactPredictionHead(
    (regression): Linear(in_features=660, out_features=1, bias=True)
    (activation): Sigmoid()
  )
  (emb_layer_norm_after): LayerNorm((1280,), eps=1

In [34]:
# Set model to use cuda GPU
if torch.cuda.is_available():
    model = model.cuda()

In [35]:
# Index protein sequence as sequence 0 (next sequence would be indexed as 1)

def generate_embeddings(pdb_id):
    protein_sequence = protein_data[pdb_id]
    esm_input_data = [(0, protein_sequence)]
    # print('Data: ', esm_input_data, '\n')

    # Prepare variables to input sequence into ESM-2 model 
    batch_converter = alphabet.get_batch_converter()
    batch_labels, batch_strs, batch_tokens = batch_converter(esm_input_data)
    batch_tokens = batch_tokens.cuda() if torch.cuda.is_available() else batch_tokens

    # print('batch_tokens: ', '\n\n', batch_tokens, '\n')

    # 4. Input prepared sequence information into model and output as results (contact predictions are included in embedding)
    with torch.no_grad():
        results = model(batch_tokens, repr_layers=[33], return_contacts=True)

    return results['attentions']

In [None]:
# attention_data = {}
# iterations = 0
# for pdb_id in train_data[:300]:
#     attention_data[pdb_id] = generate_embeddings(pdb_id)
#     print("Iterations: ", iterations)
#     iterations+=1

## Implementing Random Forest Classifier (Currently for only 1 sequence)

In [36]:
# Extract attentions from all heads and layers for given amino acid residues

def get_x_y(attention_data, res_1, res_2):
    vectors = []
    for layer in range(0,33):
        for head in range(0,20):
            vectors.append(attention_data[0][layer][head][res_1][res_2])

    return vectors


In [None]:
# # Extract attentions from all heads and layers for given amino acid residues

# def get_x_y(pdb_id, res_1, res_2):
#     vectors = []
#     for layer in range(0,33):
#         for head in range(0,20):
#             vectors.append(attention_data[pdb_id][0][layer][head][res_1][res_2])

#     return vectors


In [None]:
# for pdb_id in train_data:    
#     structure = parser.get_structure(pdb_id, f"{structure_dir}/{pdb_id}.pdb")  # Ensure correct path joining
#     protein_structure = structure[0]
#     chain = protein_structure['A']
#     print(list(chain.get_residues())[0].id[1])

In [None]:
# X_train = []
# y_train = []

# for pdb_id in train_data:
#     structure = parser.get_structure(pdb_id, f"{structure_dir}/{pdb_id}.pdb")  # Ensure correct path joining
#     protein_structure = structure[0]
#     chain = protein_structure['A']
#     first_residue = list(chain.get_residues())[0].id[1]
#     if first_residue == 1:
#         for i in (contact_data[pdb_id]):
#             if pdb_id in attention_data.keys():
#                 X_train.append(get_x_y(pdb_id, i['res_1'], i['res_2']))
#                 y_train.append(i['in_contact'])
#         else:
#             continue

In [43]:
seed_value = 64
random.seed(seed_value)
n_sequences = 30

selected_sequence_ids = random.sample(same_sequence_ids, n_sequences)

print(selected_sequence_ids[:5])


['1H9O', '1I8U', '1UM0', '1ZVN', '1Z14']


In [38]:
class DataPoint:
    def __init__(self, id, X, y):
        self.id = id
        self.X = X
        self.y = y

In [44]:
ids = []
X = []
y = []
data_points = []  # List to hold DataPoint objects
iterations = 0

for pdb_id in selected_sequence_ids:
    structure = parser.get_structure(pdb_id, f"{structure_dir}/{pdb_id}.pdb")  # Ensure correct path joining
    protein_structure = structure[0]
    chain = protein_structure['A']
    first_residue = list(chain.get_residues())[0].id[1]
    print('Iteration: ', iterations)
    iterations+=1
    if first_residue == 1:
        attention_data = generate_embeddings(pdb_id)
        for i in contact_data[pdb_id]:
            id = [pdb_id, i['res_1'], i['res_2']]
            x = get_x_y(attention_data, i['res_1'], i['res_2'])
            data_points.append(DataPoint(id, x, i['in_contact']))
    else:
        continue

Iteration:  0
Iteration:  1
Iteration:  2
Iteration:  3
Iteration:  4
Iteration:  5
Iteration:  6
Iteration:  7
Iteration:  8
Iteration:  9
Iteration:  10
Iteration:  11
Iteration:  12
Iteration:  13
Iteration:  14
Iteration:  15
Iteration:  16
Iteration:  17
Iteration:  18
Iteration:  19
Iteration:  20
Iteration:  21
Iteration:  22
Iteration:  23
Iteration:  24
Iteration:  25
Iteration:  26
Iteration:  27
Iteration:  28
Iteration:  29


In [45]:
train_data, test_data = train_test_split(data_points, test_size=0.2, random_state=42)

In [54]:
for i in contact_data['1SIS']:
    if i['res_1'] == 24 and i['res_2'] == 12:
        print(i)

{'res_1': 24, 'res_2': 12, 'sig_1': 'G', 'sig_2': 'M', 'aa_dist': 10, 'arn_dist': 4.3468437, 'in_contact': False}


In [53]:
# Create and train the RandomForest model
model = RandomForestClassifier(random_state=42)
model.fit(train_X, train_y)

# Extract X values from the test_data
test_X = [dp.X for dp in test_data]
test_y = [dp.y for dp in test_data]

# Make predictions on the test set using the extracted features
predictions = model.predict(test_X)

# Output the ids along with their predictions
for id, pred, value in zip(test_ids, predictions, test_y):
    if pred != value:
        print(f"ID: {id}, Prediction: {pred}, True Val: {value}")


ID: ['1SIS', 24, 12], Prediction: True, True Val: False
ID: ['1WGQ', 98, 31], Prediction: True, True Val: False
ID: ['2FMC', 56, 45], Prediction: True, True Val: False
ID: ['2FMC', 50, 5], Prediction: False, True Val: True
ID: ['1FH3', 65, 11], Prediction: False, True Val: True
ID: ['1FH3', 50, 3], Prediction: False, True Val: True
ID: ['2AIV', 64, 36], Prediction: False, True Val: True
ID: ['1ZRX', 37, 27], Prediction: True, True Val: False
ID: ['1NPC', 106, 47], Prediction: False, True Val: True
ID: ['1FH3', 58, 6], Prediction: False, True Val: True
ID: ['1UM0', 294, 252], Prediction: True, True Val: False
ID: ['1F53', 47, 4], Prediction: True, True Val: False
ID: ['1NPC', 174, 143], Prediction: False, True Val: True
ID: ['2FMC', 52, 16], Prediction: True, True Val: False
ID: ['1ZNU', 17, 9], Prediction: False, True Val: True
ID: ['1XNE', 71, 25], Prediction: True, True Val: False
ID: ['1ZNU', 29, 1], Prediction: False, True Val: True
ID: ['1VYX', 55, 24], Prediction: False, True Val

In [24]:
ids = []
X = []
y = []
iterations = 0
selected_sequence_ids = same_sequence_ids

for pdb_id in selected_sequence_ids:
    structure = parser.get_structure(pdb_id, f"{structure_dir}/{pdb_id}.pdb")  # Ensure correct path joining
    protein_structure = structure[0]
    chain = protein_structure['A']
    first_residue = list(chain.get_residues())[0].id[1]
    print('Iteration: ', iterations)
    iterations+=1
    if first_residue == 1:
        attention_data = generate_embeddings(pdb_id)
        for i in contact_data[pdb_id]:
                ids.append([pdb_id,i['res_1'], i['res_2']])
                X.append(get_x_y(attention_data, i['res_1'], i['res_2']))
                y.append(i['in_contact'])
        else:
            continue



Iteration:  0
Iteration:  1
Iteration:  2
Iteration:  3
Iteration:  4
Iteration:  5
Iteration:  6
Iteration:  7
Iteration:  8
Iteration:  9
Iteration:  10
Iteration:  11
Iteration:  12
Iteration:  13
Iteration:  14
Iteration:  15
Iteration:  16
Iteration:  17
Iteration:  18
Iteration:  19
Iteration:  20
Iteration:  21
Iteration:  22
Iteration:  23
Iteration:  24
Iteration:  25
Iteration:  26
Iteration:  27
Iteration:  28
Iteration:  29
Iteration:  30
Iteration:  31
Iteration:  32
Iteration:  33
Iteration:  34
Iteration:  35
Iteration:  36
Iteration:  37
Iteration:  38
Iteration:  39
Iteration:  40
Iteration:  41
Iteration:  42
Iteration:  43
Iteration:  44
Iteration:  45
Iteration:  46
Iteration:  47
Iteration:  48
Iteration:  49
Iteration:  50
Iteration:  51
Iteration:  52
Iteration:  53
Iteration:  54
Iteration:  55
Iteration:  56
Iteration:  57
Iteration:  58
Iteration:  59
Iteration:  60
Iteration:  61
Iteration:  62
Iteration:  63
Iteration:  64
Iteration:  65
Iteration:  66
Itera

Iteration:  519
Iteration:  520
Iteration:  521
Iteration:  522
Iteration:  523
Iteration:  524
Iteration:  525
Iteration:  526
Iteration:  527
Iteration:  528
Iteration:  529
Iteration:  530
Iteration:  531
Iteration:  532
Iteration:  533
Iteration:  534
Iteration:  535
Iteration:  536
Iteration:  537
Iteration:  538
Iteration:  539
Iteration:  540
Iteration:  541
Iteration:  542
Iteration:  543
Iteration:  544
Iteration:  545
Iteration:  546
Iteration:  547
Iteration:  548
Iteration:  549
Iteration:  550
Iteration:  551
Iteration:  552
Iteration:  553
Iteration:  554
Iteration:  555
Iteration:  556
Iteration:  557
Iteration:  558
Iteration:  559
Iteration:  560
Iteration:  561
Iteration:  562
Iteration:  563
Iteration:  564
Iteration:  565
Iteration:  566
Iteration:  567
Iteration:  568
Iteration:  569
Iteration:  570
Iteration:  571
Iteration:  572
Iteration:  573
Iteration:  574
Iteration:  575
Iteration:  576
Iteration:  577
Iteration:  578
Iteration:  579
Iteration:  580
Iteratio

KeyboardInterrupt: 

## Grid Search CV

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.4, random_state=55)

In [26]:
len(X_train)

20977

In [None]:
clf = RandomForestClassifier(criterion='gini', max_depth=8,max_features='sqrt',n_estimators = 200,random_state=42)
# clf = RandomForestClassifier(random_state=55, n_estimators = 500, max_features = 'sqrt', max_depth = 8, criterion = 'log_loss')

In [None]:
# param_grid = { 
#     'n_estimators': [200, 500],
#     'max_features': ['auto', 'sqrt', 'log2'],
#     'max_depth' : [4,5,6,7,8],
#     'criterion' :['gini', 'entropy', 'log_loss']
# }

In [None]:
# param_grid = { 
#     'n_estimators': [200, 500],
#     'max_features': ['auto', 'sqrt', 'log2'],
#     'max_depth' : [4,5,6,7,8],
#     'criterion' :['gini']
# }

In [None]:
# CV_clf = GridSearchCV(estimator=clf, param_grid=param_grid, cv= 5, verbose=1)
# CV_clf.fit(X_train, y_train)

In [None]:
results = CV_clf.cv_results_


In [None]:
for mean_score, params in zip(results['mean_test_score'], results['params']):
    print(f"Accuracy: {mean_score:.4f} for parameters: {params}")


In [None]:
print(results)

In [None]:
CV_clf.best_params_


In [None]:
df = pd.DataFrame(est.cv_results_)

In [None]:
# clf=RandomForestClassifier(random_state=42, max_features='auto', n_estimators= 200, max_depth=8, criterion='gini')

In [None]:
# rfc1.fit(x_train, y_train)
# pred=rfc1.predict(x_test)


In [None]:
# print("Accuracy for Random Forest on CV data: ",accuracy_score(y_test,pred))

In [None]:
# splits = [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8]
# train_errors = []
# val_errors = []

# for i in splits:
#     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=i, random_state=0)
#     clf = RandomForestClassifier()
#     clf.fit(X_train, y_train)
    
#     # Record the training and validation errors
#     train_errors.append(1 - clf.score(X_train, y_train))
#     val_errors.append(1 - clf.score(X_test, y_test))  

In [None]:
# # Plotting the error rates
# plt.figure(figsize=(10, 5))
# # plt.plot(splits, train_errors, label='Training Error')
# plt.plot(splits, val_errors, label='Error Rate', linestyle='--')
# plt.xlabel('Test Split Size')
# plt.ylabel('Error Rate')
# plt.title('Random Forest Error Rate vs. Test Split Size')
# plt.legend()
# plt.show()

In [None]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [None]:
# X_train = []
# y_train = []
# iterations = 0

# for pdb_id in train_data:
#     structure = parser.get_structure(pdb_id, f"{structure_dir}/{pdb_id}.pdb")  # Ensure correct path joining
#     protein_structure = structure[0]
#     chain = protein_structure['A']
#     first_residue = list(chain.get_residues())[0].id[1]
#     print('Iteration: ', iterations)
#     iterations+=1
#     if first_residue == 1:
#         attention_data = generate_embeddings(pdb_id)
#         for i in contact_data[pdb_id]:
#                 X_train.append(get_x_y(attention_data, i['res_1'], i['res_2']))
#                 y_train.append(i['in_contact'])
#         else:
#             continue



In [None]:
# # Fit classifier

# clf = RandomForestClassifier(max_depth=2, random_state=0)
# clf.fit(X_train, y_train)

In [None]:
# for pdb_id in test_data[:100]:
#     attention_data[pdb_id] = generate_embeddings(pdb_id)
#     print("Iterations: ", iterations)
#     iterations+=1

In [None]:
# X_test = []
# y_test = []
# iterations = 0

# for pdb_id in test_data:
#     structure = parser.get_structure(pdb_id, f"{structure_dir}/{pdb_id}.pdb")  # Ensure correct path joining
#     protein_structure = structure[0]
#     chain = protein_structure['A']
#     first_residue = list(chain.get_residues())[0].id[1]
#     print('Iteration: ', iterations)
#     iterations+=1
#     if first_residue == 1:
#         attention_data = generate_embeddings(pdb_id)
#         for i in contact_data[pdb_id]:
#                 X_test.append(get_x_y(attention_data, i['res_1'], i['res_2']))
#                 y_test.append(i['in_contact'])
#         else:
#             continue

In [None]:
clf.fit(X_train, y_train)

In [None]:
y_pred = clf.predict(X_test)

In [None]:
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

In [None]:

# Create the confusion matrix
cm = confusion_matrix(y_test, y_pred)

ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Negative', 'Positive']).plot()


In [None]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)

In [None]:
## Find X vectors that are supposed to be True

# count = 0

# for vec,cont in zip(X,y):
#     count+=1
#     if cont == True:
#         print(count)
#         print(vec)