## **PDB-CAT**

#### **You can explore the code in the following two cells or jump directly to the main code. In the main code section, review and set the necessary variables before running.**

In [5]:
# Import libraries
from PDBCAT_module import *
from pdbecif.mmcif_io import CifFileReader
from pdbecif.mmcif_tools import MMCIF2Dict
import pandas as pd
import time
import re
import os
import shutil
from Bio.Align import PairwiseAligner 
from Bio.PDB import * 
from datetime import datetime


In [6]:
"""
=========
INITIAL INFORMATION. CHANGE THE CONTENT OF THESE VARIABLES IF NECESSARY
"""
# --- Mutation analysis settings -------------------------------------------
# Analyze mutations. True or False.
mutation = True
# Path to the pdb file (CIF format) that provides the reference sequence.
pdb_reference_sequence = '5r7y.cif'
# Index of the _entity_poly entry of pdb_reference_sequence used as the
# reference sequence; 0 selects the first one.
entity_reference = 0
# Reference sequence; empty by default.
reference_seq = ''
# Threshold used to discriminate between peptides and the subunits of the
# protein.
res_threshold = 15

# --- Output locations -----------------------------------------------------
# Path and name of the csv output file.
information_df = 'df.csv'
# Output folder, located under the current working directory.
output_path = f"{os.getcwd()}/output/"

""" 
MAIN CODE
"""
# Folder (under the current working directory) with the cif files to process.
directory_path = f"{os.getcwd()}/cif/"
# The blacklist file contains the codes of the small molecules that are not
# considered ligands.
blacklist = read_blacklist("./blacklist.txt")

# READ THE REFERENCE SEQUENCE. It is a PDB file in CIF format.
# Only needed when mutations are analyzed. The atom records (_atom_site) are
# skipped on read because only the _entity_poly category is used here.
if mutation:
    ref_cfr = CifFileReader()
    ref_cif_obj = ref_cfr.read(pdb_reference_sequence, output='cif_wrapper', ignore=['_atom_site'])
    # The reader returns one entry per data block; the first one is used.
    ref_cif_data = list(ref_cif_obj.values())[0]
    if '_entity_poly' in ref_cif_data and 'pdbx_seq_one_letter_code_can' in ref_cif_data['_entity_poly']:
        # entity_reference selects which _entity_poly sequence is the reference.
        reference_seq = ref_cif_data['_entity_poly']['pdbx_seq_one_letter_code_can'][entity_reference]
        # Strip the line breaks embedded in the sequence string.
        reference_seq = reference_seq.replace("\n", "")
    else:
        # Previously this case was silent: the run continued with whatever
        # reference_seq already held. Make the problem visible without
        # aborting the run.
        import warnings
        warnings.warn(
            f"No '_entity_poly.pdbx_seq_one_letter_code_can' found in "
            f"{pdb_reference_sequence!r}; reference_seq keeps its current value "
            f"({reference_seq!r}), so the mutation analysis may not be meaningful."
        )
    
# Gather the information of every .cif file found in directory_path.
data = []  # One entry per processed PDB file
cif_paths = [
    os.path.join(directory_path, entry)
    for entry in os.listdir(directory_path)
    if entry.endswith('.cif')
]
for cif_path in cif_paths:
    data.append(process_cif_file(cif_path, mutation, blacklist, reference_seq, res_threshold))

# Build a Pandas DataFrame from the collected entries and save it, without the
# index column, to the csv output file.
df = pd.DataFrame(data)
df.to_csv(information_df, index=False)
        
# Classification
# A single if/else guarantees that no_mutated_list is always defined before
# bond_classification is called (two separate `== True` / `== False` tests
# left it undefined for any other value of `mutation`).
if mutation:
    # Classify whether there is a mutation. The call returns the list passed
    # on to the bond classification and a path that replaces output_path.
    no_mutated_list, non_mut_path = mutation_classification(information_df, output_path)
    output_path = non_mut_path
else:
    # No mutation analysis: every .cif file in directory_path is kept,
    # identified by its upper-cased file name without the extension.
    no_mutated_list = [
        os.path.splitext(filename)[0].upper()
        for filename in os.listdir(directory_path)
        if filename.endswith('.cif')
    ]

# Classify depending on the bond
bond_classification(information_df, no_mutated_list, output_path, mutation)

['7TQ5', '7LKE', '5RF1', '7BAK', '5RE4', '6XHU', '5R7Y', '8GXH', '5REB', '5REK', '6YNQ', '5RGI', '7AEG', '7Z4S', '7A1U', '6W63', '7JR4', '7WQ9', '7P51', '6LU7', '7WQ8', '7NIJ', '5RF2', '7ANS', '5REL', '6XB0', '7AQE', '5R8T', '7AP6', '7BAJ', '6XA4', '6M2N', '5REJ', '8GVY', '5RG1', '7RNW', '5R80', '7VVT', '7BAL', '7L14']
