In [7]:
from pathlib import Path
from plip.structure.preparation import PDBComplex
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

In [8]:
# Project layout. All paths are resolved against the current working directory.
HERE = Path(".").resolve()
DATA_DIR = HERE.joinpath("data")      # input structures (.cif)
PDB_DIR = HERE.joinpath("data_pdb")   # converted .pdb files are written here

# Make sure the output directory exists; an already-existing one is not an error.
PDB_DIR.mkdir(exist_ok=True)

In [9]:
# Helper for converting .cif files to .pdb
import subprocess
import os

def convert_cif_to_pdb(cif_path, pdb_path):
    """Convert one .cif file to .pdb by calling the ``obabel`` command-line tool.

    Parameters
    ----------
    cif_path : str or pathlib.Path
        Input ``.cif`` file.
    pdb_path : str or pathlib.Path
        Destination file, passed to ``obabel`` via ``-O``.

    Returns
    -------
    bool
        ``True`` when ``obabel`` exited with status 0 and a non-empty file
        exists at ``pdb_path`` afterwards; ``False`` otherwise. Failures are
        reported with ``print`` and never raised, so a batch loop can go on.
    """
    # Accept plain strings as well as Path objects (the error message needs .name).
    cif_path = Path(cif_path)
    pdb_path = Path(pdb_path)

    try:
        subprocess.run(
            ["obabel", str(cif_path), "-O", str(pdb_path)],
            check=True,
            stdout=subprocess.DEVNULL,
            # Capture stderr instead of discarding it so that a failure can be
            # reported with the tool's own diagnostics.
            stderr=subprocess.PIPE,
        )
    except subprocess.CalledProcessError as e:
        tool_output = (e.stderr or b"").decode(errors="replace").strip()
        print(f"Не удалось конвертировать {cif_path.name}: {tool_output or e}")
        return False
    except (OSError, subprocess.SubprocessError) as e:
        # OSError covers the case where the obabel executable cannot be started.
        print(f"Не удалось конвертировать {cif_path.name}: {e}")
        return False

    # Do not rely on the exit status alone: a run that leaves no usable output
    # file is treated as a failed conversion.
    if not pdb_path.is_file() or pdb_path.stat().st_size == 0:
        print(f"Не удалось конвертировать {cif_path.name}: выходной файл не создан или пуст")
        return False
    return True

In [10]:
# Collect every .cif file in DATA_DIR. The order in which glob yields entries
# is not guaranteed, so sort the paths to keep the row order of everything
# built from this list stable across runs and machines.
cif_files = sorted(DATA_DIR.glob("*.cif"))
print(f"Найдено {len(cif_files)} .cif файлов")

Найдено 83 .cif файлов


In [11]:
# Convert each .cif into PDB_DIR and remember only the outputs that succeeded.
pdb_files = []
for cif in cif_files:
    pdb_path = PDB_DIR / f"{cif.stem}.pdb"
    converted = convert_cif_to_pdb(cif, pdb_path)
    if converted:
        pdb_files.append(pdb_path)

print(f"Успешно конвертировано: {len(pdb_files)} файлов")

Успешно конвертировано: 83 файлов


In [18]:
# Helpers for analysing interaction types with PLIP.
#
# AMINO_ACIDS is the exclusion list used when picking a ligand binding site:
# a site whose name is in this set is skipped. Despite its name the set is the
# union of three groups, kept separate below for readability.

# The 20 standard amino-acid residue names.
_STANDARD_RESIDUES = {
    "ALA", "ARG", "ASN", "ASP", "CYS", "GLU", "GLN", "GLY", "HIS", "ILE",
    "LEU", "LYS", "MET", "PHE", "PRO", "SER", "THR", "TRP", "TYR", "VAL",
}

# Further three-character residue codes (presumably modified / non-standard
# amino acids -- confirm against the source this list was taken from).
_MODIFIED_RESIDUES = {
    "2AS", "3AH", "5HP", "5OW", "ACL", "AGM", "AIB", "ALM", "ALO", "ALY", "ARM",
    "ASA", "ASB", "ASK", "ASL", "ASQ", "AYA", "BCS", "BHD", "BMT", "BNN",
    "BUC", "BUG", "C5C", "C6C", "CAS", "CCS", "CEA", "CGU", "CHG", "CLE", "CME",
    "CSD", "CSO", "CSP", "CSS", "CSW", "CSX", "CXM", "CY1", "CY3", "CYG",
    "CYM", "CYQ", "DAH", "DAL", "DAR", "DAS", "DCY", "DGL", "DGN", "DHA",
    "DHI", "DIL", "DIV", "DLE", "DLY", "DNP", "DPN", "DPR", "DSN", "DSP",
    "DTH", "DTR", "DTY", "DVA", "EFC", "FLA", "FME", "GGL", "GL3", "GLZ",
    "GMA", "GSC", "HAC", "HAR", "HIC", "HIP", "HMR", "HPQ", "HTR", "HYP",
    "IAS", "IIL", "IYR", "KCX", "LLP", "LLY", "LTR", "LYM", "LYZ", "MAA", "MEN",
    "MHS", "MIS", "MK8", "MLE", "MPQ", "MSA", "MSE", "MVA", "NEM", "NEP", "NLE",
    "NLN", "NLP", "NMC", "OAS", "OCS", "OMT", "PAQ", "PCA", "PEC", "PHI",
    "PHL", "PR3", "PRR", "PTR", "PYX", "SAC", "SAR", "SCH", "SCS", "SCY",
    "SEL", "SEP", "SET", "SHC", "SHR", "SMC", "SOC", "STY", "SVA", "TIH",
    "TPL", "TPO", "TPQ", "TRG", "TRO", "TYB", "TYI", "TYQ", "TYS", "TYY", "YCM",
}

# Other het IDs that should never be picked as the ligand of interest
# (presumably solvent / buffer components and ions -- confirm).
_OTHER_EXCLUDED_HETS = {
    "GOL", "PEG", "PO4", "SO4", "ACT", "CL", "MES", "EDO", "YTH", "CA",
}

AMINO_ACIDS = _STANDARD_RESIDUES | _MODIFIED_RESIDUES | _OTHER_EXCLUDED_HETS

def get_interactions_from_pdb(pdb_path):
    """Run PLIP on one PDB file and summarise the contacts of one ligand site.

    The file is loaded into a ``PDBComplex`` and analysed. Binding sites whose
    identifier starts with a name listed in ``AMINO_ACIDS`` (the text before
    the first ``:``) are ignored; of the remaining sites only the first one,
    in ``interaction_sets`` iteration order, is reported.

    Parameters
    ----------
    pdb_path : str or pathlib.Path
        PDB file to analyse; converted with ``str()`` before loading.

    Returns
    -------
    tuple
        ``(residue_interactions, ligand_name)``. ``residue_interactions`` maps
        keys of the form ``"<restype>_<reschain>:<resnr>"`` to the set of
        interaction labels (``"hydrophobic"``, ``"hbond"``, ``"waterbridge"``,
        ``"saltbridge"``, ``"pistacking"``, ``"pication"``, ``"halogen"``,
        ``"metal"``) recorded for that residue. When no site qualifies the
        result is ``({}, "N/A")``.
    """
    complex_ = PDBComplex()
    complex_.load_pdb(str(pdb_path))
    complex_.analyze()

    # Pick the first binding site whose leading name is not on the exclusion list.
    selected = next(
        (
            (candidate_id, candidate_site)
            for candidate_id, candidate_site in complex_.interaction_sets.items()
            if candidate_id.split(":")[0] not in AMINO_ACIDS
        ),
        None,
    )
    if selected is None:
        return {}, "N/A"

    site_id, site = selected
    ligand_name = site_id.split(":")[0]

    # Each label is paired with the PLIP interaction list(s) that feed it;
    # where PLIP keeps two lists for one interaction type they are merged.
    labelled_groups = (
        ("hydrophobic", site.hydrophobic_contacts),
        ("hbond", site.hbonds_ldon + site.hbonds_pdon),
        ("waterbridge", site.water_bridges),
        ("saltbridge", site.saltbridge_lneg + site.saltbridge_pneg),
        ("pistacking", site.pistacking),
        ("pication", site.pication_laro + site.pication_paro),
        ("halogen", site.halogen_bonds),
        ("metal", site.metal_complexes),
    )

    residue_interactions = {}
    for label, contacts in labelled_groups:
        for contact in contacts:
            residue_key = f"{contact.restype}_{contact.reschain}:{contact.resnr}"
            residue_interactions.setdefault(residue_key, set()).add(label)

    return residue_interactions, ligand_name

In [19]:
# Run PLIP over every converted structure and collect:
#   data[pdb_id]    -> {residue key: set of interaction labels}
#   ligands[pdb_id] -> ligand name ("N/A" when nothing usable was found)
#   all_residues    -> union of the residue keys seen in any structure
all_residues = set()
data = {}
ligands = {}

for pdb_file in pdb_files:
    pdb_id = pdb_file.stem
    try:
        interactions, ligand_name = get_interactions_from_pdb(pdb_file)
    except Exception as e:
        # Deliberately best-effort: one unreadable structure must not abort the
        # batch. The message names the structure so the failure can be traced.
        print(f"    Пропущен {pdb_id}: {e}")
        interactions, ligand_name = {}, "N/A"
    data[pdb_id] = interactions
    ligands[pdb_id] = ligand_name
    all_residues.update(interactions)

In [20]:
# Summary counts. A structure counts as processed only when its entry in
# `data` is non-empty; an empty entry (analysis error or no usable ligand
# site) is counted as skipped.
total_files = len(pdb_files)
processed_files = len([pdb_id for pdb_id, found in data.items() if found])
skipped_files = total_files - processed_files

print(f"Всего файлов: {total_files}")
print(f"Успешно обработано: {processed_files}")
print(f"Пропущено: {skipped_files}")

Всего файлов: 83
Успешно обработано: 83
Пропущено: 0


In [21]:
# Build the dataset: one row per structure, one column per residue key.
# A cell holds the comma-separated, sorted interaction labels for that residue,
# or 0 when the structure has no contact with it.

# The column order is the same for every row, so sort once outside the loop.
residue_columns = sorted(all_residues)

rows = []
for pdb_id, interactions in data.items():
    row = {"Ligand": ligands[pdb_id], "Protein": pdb_id}
    for res in residue_columns:
        types = interactions.get(res)
        row[res] = ", ".join(sorted(types)) if types else 0
    rows.append(row)

# Passing the columns explicitly keeps "Ligand"/"Protein" present even when
# `rows` is empty, so set_index does not fail on an empty batch.
df = pd.DataFrame(rows, columns=["Ligand", "Protein", *residue_columns])
df = df.set_index(["Ligand", "Protein"])

In [22]:
# Write the full interaction table next to the notebook, then preview it.
output_file = HERE.joinpath("protein_rightligand_interaction2.0.csv")
df.to_csv(output_file)

df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,ALA_A:157,ALA_A:158,ALA_A:167,ALA_A:172,ALA_A:177,ALA_A:178,ALA_A:211,ALA_A:315,ALA_A:48,ALA_A:49,...,VAL_A:83,VAL_A:84,VAL_A:85,VAL_A:87,VAL_A:88,VAL_A:90,VAL_A:91,VAL_A:93,VAL_A:94,VAL_B:87
Ligand,Protein,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
T20,4RMZ,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,hydrophobic,0,0
STU,4U9A,0,0,hbond,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
42P,4XS2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
XPY,4Y73,hbond,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4GD,4YO6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
# Rank structures by the number of residues that show at least one interaction.
df_reset = df.reset_index()

# After reset_index the first two columns are the former index levels
# (Ligand, Protein); every remaining column is a residue.
residue_cols = df_reset.columns[2:]

# A residue cell is 0 when there is no contact and a label string otherwise,
# so counting the cells that differ from 0 gives the number of contacted
# residues. The comparison is done on the whole frame at once instead of a
# row-wise apply with a Python lambda.
df_reset["total_interactions"] = (df_reset[residue_cols] != 0).sum(axis=1)
df_sorted = df_reset.sort_values("total_interactions", ascending=False)
print(df_sorted[["Ligand", "Protein", "total_interactions"]])

   Ligand Protein  total_interactions
26    J8A    6EGA                  13
48    R7S    6VQL                  11
77    A1B    9NA4                  11
56    O6X    8ATL                  10
82    A1J    9R9K                  10
..    ...     ...                 ...
38    LSV    6O95                   4
53    B4U    7QG3                   4
62    ZVD    8SCE                   4
52    B6I    7QG2                   3
69    WFQ    8UCC                   3

[83 rows x 3 columns]


In [25]:
# Inspect a single complex (6EGA): list every residue that has a contact.
row = df_reset.loc[df_reset["Protein"] == "6EGA"].iloc[0]

# Keep only the non-zero cells, then leave out the bookkeeping columns so that
# only residue entries remain.
nonzero_interactions = row[row != 0]
bookkeeping = ("Ligand", "Protein", "total_interactions")
residue_cols = [name for name in nonzero_interactions.index if name not in bookkeeping]
result = nonzero_interactions[residue_cols]

print("Взаимодействия для 6EGA:")
for residue, interaction_types in result.items():
    print(f"  {residue}: {interaction_types}")

Взаимодействия для 6EGA:
  ALA_A:63: hydrophobic
  ASP_A:181: hbond, hydrophobic, saltbridge
  GLU_A:85: hbond, hydrophobic
  HIS_A:161: pication
  ILE_A:160: hbond
  ILE_A:179: halogen
  LEU_A:170: hydrophobic
  LYS_A:65: hydrophobic
  MET_A:117: hbond
  PHE_A:182: pistacking
  TYR_A:114: hydrophobic, pistacking
  TYR_A:116: hydrophobic
  VAL_A:52: hydrophobic
