In [3]:
from pathlib import Path
from plip.structure.preparation import PDBComplex
import pandas as pd
import warnings
import subprocess
import os
from tqdm import tqdm
warnings.filterwarnings("ignore")

In [2]:
HERE = Path(".").resolve()
PDB_DIR = HERE / "pdb_api"
PDB_DIR.mkdir(exist_ok=True)

In [4]:
pdb_files = [f for f in os.listdir(PDB_DIR) if f.lower().endswith(".pdb")]

In [6]:
#функция для анализа типов связей через PLIP
AMINO_ACIDS = {
    'ALA', 'ARG', 'ASN', 'ASP', 'CYS', 'GLU', 'GLN', 'GLY', 'HIS', 'ILE',
    'LEU', 'LYS', 'MET', 'PHE', 'PRO', 'SER', 'THR', 'TRP', 'TYR', 'VAL',
    '2AS', '3AH', '5HP', '5OW', 'ACL', 'AGM', 'AIB', 'ALM', 'ALO', 'ALY', 'ARM',
    'ASA', 'ASB', 'ASK', 'ASL', 'ASQ', 'AYA', 'BCS', 'BHD', 'BMT', 'BNN',
    'BUC', 'BUG', 'C5C', 'C6C', 'CAS', 'CCS', 'CEA', 'CGU', 'CHG', 'CLE', 'CME',
    'CSD', 'CSO', 'CSP', 'CSS', 'CSW', 'CSX', 'CXM', 'CY1', 'CY3', 'CYG',
    'CYM', 'CYQ', 'DAH', 'DAL', 'DAR', 'DAS', 'DCY', 'DGL', 'DGN', 'DHA',
    'DHI', 'DIL', 'DIV', 'DLE', 'DLY', 'DNP', 'DPN', 'DPR', 'DSN', 'DSP',
    'DTH', 'DTR', 'DTY', 'DVA', 'EFC', 'FLA', 'FME', 'GGL', 'GL3', 'GLZ',
    'GMA', 'GSC', 'HAC', 'HAR', 'HIC', 'HIP', 'HMR', 'HPQ', 'HTR', 'HYP',
    'IAS', 'IIL', 'IYR', 'KCX', 'LLP', 'LLY', 'LTR', 'LYM', 'LYZ', 'MAA', 'MEN',
    'MHS', 'MIS', 'MK8', 'MLE', 'MPQ', 'MSA', 'MSE', 'MVA', 'NEM', 'NEP', 'NLE',
    'NLN', 'NLP', 'NMC', 'OAS', 'OCS', 'OMT', 'PAQ', 'PCA', 'PEC', 'PHI',
    'PHL', 'PR3', 'PRR', 'PTR', 'PYX', 'SAC', 'SAR', 'SCH', 'SCS', 'SCY',
    'SEL', 'SEP', 'SET', 'SHC', 'SHR', 'SMC', 'SOC', 'STY', 'SVA', 'TIH',
    'TPL', 'TPO', 'TPQ', 'TRG', 'TRO', 'TYB', 'TYI', 'TYQ', 'TYS', 'TYY', 'YCM',
    'GOL', 'PEG', 'PO4', 'SO4', 'ACT', 'CL', 'MES', 'EDO', 'YTH', 'CA'
}

def get_interactions_from_pdb(pdb_path):
    protlig = PDBComplex()
    protlig.load_pdb(str(pdb_path))
    protlig.analyze()

    valid_sites = []
    for bsid, site in protlig.interaction_sets.items():
        ligand_name = bsid.split(":")[0]
        if ligand_name not in AMINO_ACIDS:
            valid_sites.append((bsid, site))

    if not valid_sites:
        return {}, "N/A" 

    # Берём первый валидный сайт
    first_bsid, site = valid_sites[0]
    ligand_name = first_bsid.split(":")[0]

    residue_interactions = {}
    interaction_map = {
        "hydrophobic": site.hydrophobic_contacts,
        "hbond": site.hbonds_ldon + site.hbonds_pdon,
        "waterbridge": site.water_bridges,
        "saltbridge": site.saltbridge_lneg + site.saltbridge_pneg,
        "pistacking": site.pistacking,
        "pication": site.pication_laro + site.pication_paro,
        "halogen": site.halogen_bonds,
        "metal": site.metal_complexes,
    }

    for itype, interactions in interaction_map.items():
        for i in interactions:
            res_key = f"{i.restype}_{i.reschain}:{i.resnr}"
            if res_key not in residue_interactions:
                residue_interactions[res_key] = set()
            residue_interactions[res_key].add(itype)

    return residue_interactions, ligand_name

In [7]:
pdb_files

['4RMZ.pdb',
 '4U9A.pdb',
 '4XS2.pdb',
 '4Y73.pdb',
 '4YO6.pdb',
 '4YP8.pdb',
 '4ZTL.pdb',
 '4ZTM.pdb',
 '4ZTN.pdb',
 '5K72.pdb',
 '5K75.pdb',
 '5K76.pdb',
 '5K7G.pdb',
 '5K7I.pdb',
 '5KX7.pdb',
 '5KX8.pdb',
 '5T1S.pdb',
 '5T1T.pdb',
 '5UIQ.pdb',
 '5UIR.pdb',
 '5UIS.pdb',
 '5UIT.pdb',
 '5UIU.pdb',
 '5W84.pdb',
 '5W85.pdb',
 '6EG9.pdb',
 '6EGA.pdb',
 '6EGD.pdb',
 '6EGE.pdb',
 '6F3D.pdb',
 '6F3E.pdb',
 '6F3G.pdb',
 '6F3I.pdb',
 '6LXY.pdb',
 '6MOM.pdb',
 '6N8G.pdb',
 '6O8U.pdb',
 '6O94.pdb',
 '6O95.pdb',
 '6O9D.pdb',
 '6RFI.pdb',
 '6RFJ.pdb',
 '6THW.pdb',
 '6THX.pdb',
 '6THZ.pdb',
 '6TI8.pdb',
 '6TIA.pdb',
 '6UYA.pdb',
 '6VQL.pdb',
 '7C2V.pdb',
 '7C2W.pdb',
 '7QG1.pdb',
 '7QG2.pdb',
 '7QG3.pdb',
 '7QG5.pdb',
 '8DKS.pdb',
 '8SCE.pdb',
 '8SCV.pdb',
 '8SCW.pdb',
 '8TVM.pdb',
 '8TVN.pdb',
 '8TX0.pdb',
 '8UCB.pdb',
 '8UCC.pdb',
 '8V1O.pdb',
 '8V2F.pdb',
 '8V2L.pdb',
 '9R9G.pdb']

In [16]:
all_residues = set()
data = {}
ligands = {}

for pdb_file in tqdm(pdb_files):
    pdb_path = os.path.join(PDB_DIR, pdb_file)
    pdb_id = pdb_file[:4]
    try:
        interactions, ligand_name = get_interactions_from_pdb(pdb_path)
        data[pdb_id] = interactions
        ligands[pdb_id] = ligand_name
        all_residues.update(interactions.keys())
    except Exception as e:
        print(f"    Пропущен: {e}")
        data[pdb_id] = {}
        ligands[pdb_id] = "N/A"

100%|██████████████████████████████████████████████████████████████████████████████████| 68/68 [05:02<00:00,  4.45s/it]


In [18]:
total_files = len(pdb_files)
processed_files = sum(1 for v in data.values() if v)
skipped_files = total_files - processed_files

print(f"Всего файлов: {total_files}")
print(f"Успешно обработано: {processed_files}")
print(f"Пропущено: {skipped_files}")

Всего файлов: 68
Успешно обработано: 68
Пропущено: 0


In [24]:
#формируем датасет
def is_chain_A(residue_key):
        chain_part = residue_key.split("_", 1)[1]
        chain = chain_part.split(":")[0]           
        return chain == "A"

rows = []
residues_chain_A = [res for res in sorted(all_residues) if is_chain_A(res)]

for pdb_id in data:
    row = {
        "Ligand": ligands[pdb_id],
        "Protein": pdb_id
    }
    for res in residues_chain_A:
        types = data[pdb_id].get(res, set())
        row[res] = ", ".join(sorted(types)) if types else 0
    rows.append(row)

df = pd.DataFrame(rows)
df = df.set_index(["Ligand", "Protein"])

In [25]:
output_file = HERE / "protein_rightligand_interaction2.0.csv"
df.to_csv(output_file)

df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,ALA_A:211,ALA_A:315,ARG_A:273,ASN_A:267,ASN_A:316,ASP_A:272,ASP_A:278,ASP_A:329,GLU_A:194,GLU_A:233,...,PRO_A:266,SER_A:269,SER_A:328,THR_A:280,TYR_A:262,TYR_A:264,VAL_A:200,VAL_A:236,VAL_A:246,VAL_A:263
Ligand,Protein,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
T20,4RMZ,hydrophobic,0,0,0,0,0,0,0,0,0,...,0,0,0,hydrophobic,hydrophobic,hydrophobic,hydrophobic,0,hydrophobic,0
STU,4U9A,0,hbond,0,0,0,0,0,0,0,0,...,0,0,0,0,hydrophobic,0,0,0,0,0
42P,4XS2,0,0,0,0,0,0,0,hydrophobic,0,0,...,0,hbond,0,0,hydrophobic,hbond,hydrophobic,0,0,halogen
XPY,4Y73,hydrophobic,hbond,waterbridge,0,0,halogen,0,hbond,0,0,...,0,0,0,0,0,0,hydrophobic,0,0,0
4GD,4YO6,hydrophobic,0,waterbridge,0,0,0,0,0,0,0,...,0,0,0,0,0,hydrophobic,hydrophobic,0,0,0


In [26]:
df_reset = df.reset_index()

residue_cols = df_reset.columns[2:]
df_reset["total_interactions"] = df_reset[residue_cols].apply(lambda row: (row != 0).sum(), axis=1)
df_sorted = df_reset.sort_values("total_interactions", ascending=False)
print(df_sorted[["Ligand", "Protein", "total_interactions"]])

   Ligand Protein  total_interactions
57    ZVG    8SCV                  13
26    J8A    6EGA                  13
24    9YS    5W85                  12
66    YK0    8V2L                  11
47    QL7    6UYA                  11
..    ...     ...                 ...
15    6YE    5KX8                   4
52    B6I    7QG2                   4
28    DL1    6EGE                   0
14    6YD    5KX7                   0
27    J87    6EGD                   0

[68 rows x 3 columns]


In [27]:
row = df_reset[df_reset["Protein"] == "6EGA"].iloc[0]

nonzero_interactions = row[row != 0]
residue_cols = [col for col in nonzero_interactions.index if col not in ["Ligand", "Protein", "total_interactions"]]
result = nonzero_interactions[residue_cols]

print("Взаимодействия для 6EGA:")
for residue, interaction_types in result.items():
    print(f"  {residue}: {interaction_types}")

Взаимодействия для 6EGA:
  ALA_A:211: hydrophobic
  ASP_A:329: hbond, hydrophobic, saltbridge
  GLU_A:233: hbond, hydrophobic
  HIS_A:309: pication
  ILE_A:308: hbond
  ILE_A:327: halogen
  LEU_A:318: hydrophobic
  LYS_A:213: hydrophobic
  MET_A:265: hbond
  PHE_A:330: pistacking
  TYR_A:262: hydrophobic, pistacking
  TYR_A:264: hydrophobic
  VAL_A:200: hydrophobic


In [28]:
import glob
from IPython.display import Image

# Запустить PLIP
!plip -f data_pdb/6ega.pdb -x -y

# Найти и отобразить PNG-файлы
png_files = glob.glob("6ega_*.png")
for f in png_files:
    display(Image(filename=f, width=600))

2025-12-11 00:01:20,121 [ERROR] [plipcmd.py:393] plip.plipcmd: PyMOL is required for the --pics and --pymol option


In [4]:
!grep "HETATM" data_pdb/6ega.pdb | head -5

"grep" ­Ґ пў«пҐвбп ў­гваҐ­­Ґ© Ё«Ё ў­Ґи­Ґ©
Є®¬ ­¤®©, ЁбЇ®«­пҐ¬®© Їа®Ја ¬¬®© Ё«Ё Ї ЄҐв­л¬ д ©«®¬.
