# 학습 목표
1. 구조 기반 가상 스크리닝(SBVS)의 개념과 필요한 데이터 이해하기
2. 주요 단백질-리간드 데이터베이스의 특징과 접근 방법 학습하기

# 1. 주요 단백질-리간드 데이터베이스
- PDB (Protein Data Bank): https://www.rcsb.org/
- PDBbind: http://www.pdbbind.org.cn/
- BindingDB: https://www.bindingdb.org/rwd/bind/index.jsp
- DUD-E (Directory of Useful Decoys - Enhanced): https://dude.docking.org/

# 2. BindingDB Articles

In [None]:
# Install the required libraries
!pip install biopandas py3Dmol rdkit

Collecting rdkit
  Downloading rdkit-2024.9.6-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.0 kB)
Downloading rdkit-2024.9.6-cp311-cp311-manylinux_2_28_x86_64.whl (34.3 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 34.3/34.3 MB 34.2 MB/s eta 0:00:00
Installing collected packages: rdkit
Successfully installed rdkit-2024.9.6


In [None]:
# Download the BindingDB "Articles" 2D SDF archive
# (the 202503 in the file name is presumably the release date -- confirm)
!wget https://www.bindingdb.org/rwd/bind/downloads/BindingDB_BindingDB_Articles_2D_202503_sdf.zip

import os
import zipfile

# Archive file name in the current working directory
zip_path = "BindingDB_BindingDB_Articles_2D_202503_sdf.zip"

# Directory the archive is extracted into
extract_dir = "bindingdb_data"

# Create the target directory. exist_ok=True replaces the separate
# "exists?" check, so there is no check-then-create race and re-running
# the cell is harmless.
os.makedirs(extract_dir, exist_ok=True)

# Extract every member of the archive into extract_dir
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)

--2025-03-29 16:47:55--  https://www.bindingdb.org/rwd/bind/downloads/BindingDB_BindingDB_Articles_2D_202503_sdf.zip
Resolving www.bindingdb.org (www.bindingdb.org)... 132.239.186.19
Connecting to www.bindingdb.org (www.bindingdb.org)|132.239.186.19|:443... connected.
HTTP request sent, awaiting response... 200 
Length: 37104817 (35M) [application/zip]
Saving to: ‘BindingDB_BindingDB_Articles_2D_202503_sdf.zip’


2025-03-29 16:47:57 (26.6 MB/s) - ‘BindingDB_BindingDB_Articles_2D_202503_sdf.zip’ saved [37104817/37104817]



In [None]:
import pandas as pd
import numpy as np
from rdkit import Chem

def process_sdf_file(sdf_path):
    """Read an SDF file and collect one property dict per usable molecule.

    Args:
        sdf_path: Path of the SDF file handed to ``Chem.SDMolSupplier``.

    Returns:
        A list of dicts. Each dict maps every property name reported by the
        molecule to its ``GetProp`` value, plus a ``'SMILES'`` key holding
        the string produced by ``Chem.MolToSmiles``.
    """
    records = []

    for molecule in Chem.SDMolSupplier(sdf_path):
        # Entries the supplier yields as None are skipped entirely.
        if molecule is not None:
            # Copy every property stored on the molecule.
            record = {
                name: molecule.GetProp(name)
                for name in molecule.GetPropNames()
            }
            # Add the SMILES string generated from the molecule.
            record['SMILES'] = Chem.MolToSmiles(molecule)
            records.append(record)

    return records

# Path of the extracted SDF file. The directory and file name are passed as
# separate components so os.path.join actually builds the path (a single
# pre-joined argument is returned unchanged).
sdf_file = os.path.join('bindingdb_data', 'BindingDB_BindingDB_Articles_2D.sdf')

# Parse the SDF file into a list of per-compound property dicts
compounds_data = process_sdf_file(sdf_file)

[16:49:05] Can't kekulize mol.  Unkekulized atoms: 22 23 24 25 26
[16:49:05] ERROR: Could not sanitize molecule ending on line 15956
[16:49:05] ERROR: Can't kekulize mol.  Unkekulized atoms: 22 23 24 25 26
[16:49:35] Explicit valence for atom # 27 N, 4, is greater than permitted
[16:49:35] ERROR: Could not sanitize molecule ending on line 8249825
[16:49:35] ERROR: Explicit valence for atom # 27 N, 4, is greater than permitted
[16:49:35] Explicit valence for atom # 27 N, 4, is greater than permitted
[16:49:35] ERROR: Could not sanitize molecule ending on line 8252267
[16:49:35] ERROR: Explicit valence for atom # 27 N, 4, is greater than permitted
[16:49:57] Explicit valence for atom # 24 O, 3, is greater than permitted
[16:49:57] ERROR: Could not sanitize molecule ending on line 13706367
[16:49:57] ERROR: Explicit valence for atom # 24 O, 3, is greater than permitted
[16:49:57] Explicit valence for atom # 24 O, 3, is greater than permitted
[16:49:57] ERROR: Could not sanitize molecule e

In [None]:
# Build a DataFrame from the parsed compound records
df = pd.DataFrame(compounds_data)

# Columns to keep, in the order they should appear
required_columns = [
    'BindingDB MonomerID',
    'BindingDB Ligand Name',
    'SMILES',
    'BindingDB Target Chain Sequence',
    'PDB ID(s) of Target Chain',
    'Ki (nM)',
    'IC50 (nM)',
    'Kd (nM)',
    'EC50 (nM)',
    'Target Name',
]

# Restrict the request to columns that are actually present in df
available_columns = [name for name in required_columns if name in df.columns]
df_final = df[available_columns]

# Report the resulting shape and column list
print(f"데이터프레임 크기: {df_final.shape}")
print(f"사용 가능한 컬럼: {df_final.columns.tolist()}")

데이터프레임 크기: (92852, 10)
사용 가능한 컬럼: ['BindingDB MonomerID', 'BindingDB Ligand Name', 'SMILES', 'BindingDB Target Chain Sequence', 'PDB ID(s) of Target Chain', 'Ki (nM)', 'IC50 (nM)', 'Kd (nM)', 'EC50 (nM)', 'Target Name']


In [None]:
# Show the selected columns (the bare last expression is rendered as the cell output)
df_final

Unnamed: 0,BindingDB MonomerID,BindingDB Ligand Name,SMILES,BindingDB Target Chain Sequence,PDB ID(s) of Target Chain,Ki (nM),IC50 (nM),Kd (nM),EC50 (nM),Target Name
0,4521,3-quinolinecarbonitrile 3::4-Phenylamino-3-qui...,COc1cc2c(Nc3ccc(Cl)cc3Cl)c(C#N)cnc2cc1OCCCN1CC...,MGSNKSKPKDASQRRRSLEPAENVHGAGGGAFPASQTPSKPASADG...,"1FMK,1HCS,1HCT,1O41,1O42,1O43,1O44,1O45,1O46,1...",,8.7,,,Proto-oncogene tyrosine-protein kinase Src
1,6121,3-quinolinecarbonitrile 4::6-methoxy-7-[3-(4-m...,COc1cc2c(Nc3cc(OC)c(OC)c(OC)c3)c(C#N)cnc2cc1OC...,MGSNKSKPKDASQRRRSLEPAENVHGAGGGAFPASQTPSKPASADG...,"1FMK,1HCS,1HCT,1O41,1O42,1O43,1O44,1O45,1O46,1...",,5.1,,,Proto-oncogene tyrosine-protein kinase Src
2,6122,"3-quinolinecarbonitrile 8::4-[(2,4-dichloro-5-...",COCCOc1cc2ncc(C#N)c(Nc3cc(OC)c(Cl)cc3Cl)c2cc1O...,MGSNKSKPKDASQRRRSLEPAENVHGAGGGAFPASQTPSKPASADG...,"1FMK,1HCS,1HCT,1O41,1O42,1O43,1O44,1O45,1O46,1...",,2.8,,,Proto-oncogene tyrosine-protein kinase Src
3,6123,"3-quinolinecarbonitrile 9::4-[(2,4-dichlorophe...",COCCOc1cc2ncc(C#N)c(Nc3ccc(Cl)cc3Cl)c2cc1OCCOC,MGSNKSKPKDASQRRRSLEPAENVHGAGGGAFPASQTPSKPASADG...,"1FMK,1HCS,1HCT,1O41,1O42,1O43,1O44,1O45,1O46,1...",,12,,,Proto-oncogene tyrosine-protein kinase Src
4,6124,"3-quinolinecarbonitrile 10::6,7-bis(2-methoxye...",COCCOc1cc2ncc(C#N)c(Nc3cc(OC)c(OC)c(OC)c3)c2cc...,MGSNKSKPKDASQRRRSLEPAENVHGAGGGAFPASQTPSKPASADG...,"1FMK,1HCS,1HCT,1O41,1O42,1O43,1O44,1O45,1O46,1...",,25,,,Proto-oncogene tyrosine-protein kinase Src
...,...,...,...,...,...,...,...,...,...,...
92847,536422,"(S)-5-(1-(2-((3-chloro-6- (2,4-dimethylpiperaz...",C[C@H]1CN(C)CCN1c1cc(NC(=O)Cn2cc(-c3cc(C(N)=O)...,ADSCIQFTRHASDVLLNLNRLRSRDILTDVVIVVSREQFRAHKTVL...,,,,6.00,,B-cell lymphoma 6 protein [5-128]
92848,536462,"(S)-5-(1-(2-((3-chloro- 6-(2,4-dimethyl- piper...",C[C@H]1CN(C)CCN1c1cc(NC(=O)Cn2cc(-c3cc(C(N)=O)...,ADSCIQFTRHASDVLLNLNRLRSRDILTDVVIVVSREQFRAHKTVL...,,,,26,,B-cell lymphoma 6 protein [5-128]
92849,536462,"(S)-5-(1-(2-((3-chloro- 6-(2,4-dimethyl- piper...",C[C@H]1CN(C)CCN1c1cc(NC(=O)Cn2cc(-c3cc(C(N)=O)...,ADSCIQFTRHASDVLLNLNRLRSRDILTDVVIVVSREQFRAHKTVL...,,,,26,,B-cell lymphoma 6 protein [5-128]
92850,536470,"(S)-5-(1-(2-((3-chloro- 6-(2,4-dimethyl- piper...",C[C@H]1CN(C)CCN1c1cc(NC(=O)Cn2cc(-c3cc(C(N)=O)...,ADSCIQFTRHASDVLLNLNRLRSRDILTDVVIVVSREQFRAHKTVL...,,,,47,,B-cell lymphoma 6 protein [5-128]


# 3. 지표 이해
- Ki (억제 상수): 효소와 억제제 사이의 결합 친화도를 나타냅니다. 낮을수록 강한 결합을 의미
- IC50: 효소 활성이나 결합을 50% 억제하는 데 필요한 화합물의 농도입니다. 낮을수록 강력
- Kd (해리 상수): 단백질-리간드 복합체의 해리 경향을 나타냅니다. 낮을수록 강한 결합을 의미
- EC50: 최대 반응의 50%를 유도하는 데 필요한 약물의 농도입니다. 낮을수록 효과적

* 100~1000 nM 정도를 cutoff로 이용 가능