# **PDB-CAT**
#### **You have the option to explore the code in the following cells or jump directly into the Settings. In the settings, everything is set by default.**

If you are using the mutation filter, set the variable `reference`
to the name of the reference file (it must be inside the `cif` folder), and change `mutation` to `True`.

For example:

```
reference = "1ABC.cif"
mutation = True
```



## Environment setup

In [1]:
import os
# Clone PDB-CAT and install its requirements once. The empty PDB-CAT_READY
# sentinel file marks a completed setup, so re-running this cell is a no-op.
if not os.path.isfile("PDB-CAT_READY"):
    import subprocess
    import sys

    # check=True aborts the cell on the first failing step, so the sentinel
    # is never written for a broken clone or a failed install.
    if not os.path.isdir("PDB-CAT"):
        # Skip the clone when a previous (interrupted) run already fetched it;
        # `git clone` refuses to write into an existing non-empty directory.
        subprocess.run(
            ["git", "clone", "https://github.com/URV-cheminformatics/PDB-CAT.git"],
            check=True,
        )

    # Install the repository's requirements with the interpreter that runs
    # this notebook; cwd= replaces the chdir-in / chdir-out pair.
    subprocess.run(
        [sys.executable, "-m", "pip", "install", "-r", "requirements.txt"],
        cwd="PDB-CAT",
        check=True,
    )

    # Equivalent of `touch PDB-CAT_READY`: create the sentinel without truncating.
    with open("PDB-CAT_READY", "a"):
        pass
print("PDB-CAT installed")

PDB-CAT installed


In [15]:
# Root folder of the cloned repository. The trailing slash is required:
# later paths are built by plain string concatenation (e.g. github + "cif/").
github = 'PDB-CAT/'

def ensure_directories():
    """Create the ``cif`` and ``out`` folders inside the ``github`` folder.

    ``cif`` is where the input files are placed and ``out`` receives the
    results. Folders that already exist are left untouched, so the function
    can be called repeatedly. Prints "Directories ensured" when done.

    Raises:
        FileNotFoundError: if the ``github`` folder itself does not exist.
    """
    for folder in ("cif", "out"):
        path = os.path.join(github, folder)
        # os.mkdir (not makedirs) on purpose: a missing parent means the
        # repository was never cloned, and that should surface as an error
        # instead of silently creating an empty PDB-CAT tree.
        if not os.path.isdir(path):
            os.mkdir(path)

    print("Directories ensured")

ensure_directories()

Directories ensured


In [13]:
# Import libraries

from pdbecif.mmcif_io import CifFileReader
from pdbecif.mmcif_tools import MMCIF2Dict
import pandas as pd
import time
import re
import shutil
from Bio.Align import PairwiseAligner
from Bio.PDB import *
from datetime import datetime
from google.colab import files
import sys
sys.path.append('/content/PDB-CAT')
from PDBCAT_module import *


## 👈 Add your files to the `cif` folder

## Settings

In [19]:
"""
=========
INITIAL INFORMATION. CHANGE THE CONTENT OF THESE VARIABLES IF NECESSARY
"""
# File name of the reference structure, e.g. "1ABC.cif". It is appended to
# directory_path below, so the file must be inside the cif folder.
# Only read when mutation is True.
reference = ''

directory_path = github + "cif/"                      # Path to the folder with the cif files to process
out_file = github + "df.csv"                          # Path and name of the FIRST csv output file (protein-centered) (.csv)
out_file_ligands = github + "df_ligands.csv"          # Path and name of the SECOND csv output file (ligand-centered) (.csv)
mutation = False                                      # Analyze mutations. True or False (True requires `reference` to be set)
output_path = github + "out/"                         # Path for the new categorizing folders
pdb_reference_sequence = directory_path + reference   # Path to the reference file (a PDB entry in CIF format) that provides the reference sequence
entity_reference = 0                                  # Index of the _entity_poly entry used as reference sequence; 0 means the first one in pdb_reference_sequence
res_threshold = 15                                    # Choose a threshold to discriminate between peptides and the subunits of the protein



## Main Code

In [18]:
"""
MAIN CODE. YOU DO NOT NEED TO CHANGE THIS PART
"""


def _split_cell(record, key):
    """Return the newline-separated entries stored under *key* in *record*."""
    return record[key].split('\n')


def _entry(entries, index):
    """Return the stripped entry at *index*, or "" when *entries* is shorter."""
    return entries[index].strip() if index < len(entries) else ""


def _ligand_rows(record, fields, discarded_names):
    """Expand one protein-centered record into ligand-centered rows.

    Args:
        record: dict returned by process_cif_file for one file; the ligand
            columns hold one entry per line.
        fields: column names copied from *record* into every row.
        discarded_names: mapping from a discarded ligand code to its name.

    Returns:
        A list of dicts keyed by *fields*. For each line index it holds the
        ligand row, then the discarded-ligand row (if any), then the
        branched-molecule row (if any). Rows with an empty "Ligand" can
        occur; the caller filters them out.
    """
    ligands = _split_cell(record, "Ligand")
    names = _split_cell(record, "Ligand_names")
    types = _split_cell(record, "Ligand_types")
    covalent = _split_cell(record, "Covalent_Bond")
    bonds = _split_cell(record, "Bond")
    discarded = _split_cell(record, "Discarted_Ligands")
    branched = _split_cell(record, "Branched")
    branched_names = _split_cell(record, "Branched_name")
    branched_covalent = _split_cell(record, "Branched_Covalent")
    branched_bonds = _split_cell(record, "Branched_Bond")

    # The ligand, discarded and branched lists can have different lengths;
    # iterate up to the longest and pad the shorter ones with "".
    row_count = max(len(ligands), len(names), len(types), len(covalent),
                    len(bonds), len(discarded), len(branched))

    rows = []
    for i in range(row_count):
        row = {field: record[field] for field in fields}
        row["Ligand"] = _entry(ligands, i)
        row["Ligand_names"] = _entry(names, i)
        row["Ligand_types"] = _entry(types, i)
        row["Covalent_Bond"] = _entry(covalent, i)
        row["Bond"] = _entry(bonds, i)
        rows.append(row)

        # Discarded (blacklisted) molecule on this line, if any
        discarded_code = _entry(discarded, i)
        if discarded_code:
            row = {field: record[field] for field in fields}
            row["Ligand"] = discarded_code
            row["Ligand_names"] = discarded_names[discarded_code]
            row["Ligand_types"] = "Discarded"
            row["Covalent_Bond"] = ""
            row["Bond"] = ""
            rows.append(row)

        # Branched molecule on this line, if any
        branched_code = _entry(branched, i)
        if branched_code:
            row = {field: record[field] for field in fields}
            row["Ligand"] = branched_code
            row["Ligand_names"] = _entry(branched_names, i)
            row["Ligand_types"] = "Branched"
            row["Covalent_Bond"] = _entry(branched_covalent, i)
            row["Bond"] = _entry(branched_bonds, i)
            rows.append(row)

    return rows


blacklist, blacklist_dict = read_blacklist("/content/PDB-CAT/blacklist.txt") # Path to the blacklist file that contain the codes of the small molecules not considered ligands

# READ THE REFERENCE SEQUENCE. It is a PDB file in CIF format.
reference_seq = ''
if mutation:
    # Fail early with a clear message: with an empty `reference` the path
    # points at the cif folder itself and the reader would fail obscurely.
    if not os.path.isfile(pdb_reference_sequence):
        raise FileNotFoundError(
            f"Reference file not found: {pdb_reference_sequence!r}. "
            "Set `reference` to the name of a .cif file inside the cif folder."
        )
    ref_cfr = CifFileReader()
    ref_cif_obj = ref_cfr.read(pdb_reference_sequence, output='cif_wrapper', ignore=['_atom_site'])
    ref_cif_data = list(ref_cif_obj.values())[0]
    if '_entity_poly' in ref_cif_data:
        reference_seq = ref_cif_data['_entity_poly']['pdbx_seq_one_letter_code_can'][entity_reference]
        reference_seq = reference_seq.replace("\n", "")

data = []          # First csv output. Protein-centered
data_ligands = []  # Second csv output. Ligand-centered
fields_to_include = ["PDB_ID", "Ligand", "Ligand_names", "Ligand_types", "Ligand_functions", "Covalent_Bond", "Bond"]

# sorted() makes the row order of both CSV files reproducible between runs
for filename in sorted(os.listdir(directory_path)):
    if not filename.endswith('.cif'):
        continue
    file_path = os.path.join(directory_path, filename)
    data_from_file = process_cif_file(file_path, mutation, blacklist, reference_seq, res_threshold)
    data.append(data_from_file)
    # One row per ligand / discarded molecule / branched molecule
    data_ligands.extend(_ligand_rows(data_from_file, fields_to_include, blacklist_dict))

if not data:
    # Without this guard the ligand table has no columns and the code below
    # fails with an unhelpful KeyError.
    raise FileNotFoundError(f"No .cif files found in {directory_path!r}")

# First csv output. Protein-centered
df = pd.DataFrame(data)  # Create a Pandas df
df.to_csv(out_file, index=False)  # Save the df into a file

# Second csv output. Ligand-centered
# columns= pins the column order, which the positional rename below relies on
df_ligand = pd.DataFrame(data_ligands, columns=fields_to_include)

# Remove rows where 'Ligand' is empty or contains only white spaces
df_ligand['Ligand'] = df_ligand['Ligand'].str.strip()
df_ligand = df_ligand[df_ligand['Ligand'] != '']

# Define the new names for the columns
new_header = ['ID', 'Molecule', 'Name', 'Type', 'Function', 'Covalent', 'Bond']
df_ligand.columns = new_header

df_ligand.to_csv(out_file_ligands, index=False) # Save the df into a file

# Classify whether there is a mutation
if mutation:
    no_mutated_list, non_mut_path = mutation_classification(directory_path, out_file, output_path)
    output_path = non_mut_path
else:
    # Every processed structure counts as non-mutated; only .cif files are
    # listed so stray entries in the folder do not produce bogus IDs.
    no_mutated_list = [
        os.path.splitext(filename)[0]
        for filename in os.listdir(directory_path)
        if filename.endswith('.cif')
    ]

# Classify depend on the bond
bond_classification(directory_path, out_file, no_mutated_list, output_path, mutation)

## Download results

In [20]:
# @title Package and Download results { display-mode: "form" }
import subprocess

results_zip = "PDB-CAT-result.zip"

# `zip -r` updates an existing archive in place, so remove any archive from a
# previous run; otherwise stale results would be shipped in the download.
if os.path.exists(results_zip):
    os.remove(results_zip)

# Argument list instead of a shell string: paths containing spaces or shell
# metacharacters are passed through unchanged.
subprocess.run(["zip", "-r", results_zip, output_path, os.path.join(github, "out")])

if not os.path.isfile(results_zip):
    raise RuntimeError(f"Could not create {results_zip}; check that the results folders exist")
files.download(results_zip)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>