# 학습 목표
1. 리간드 기반 스크리닝 데이터를 수집하는 방법을 알아본다.
2. 데이터 전처리 방법을 알아본다.

데이터 출처 (PubChem BioAssay AID 1890): https://pubchem.ncbi.nlm.nih.gov/bioassay/1890

In [None]:
import requests
import pandas as pd

# Download the PubChem BioAssay results for one assay as CSV, save them to
# disk, and load them into a DataFrame for inspection.

# PubChem BioAssay AID to fetch: 1890
assay_id = "1890"
# PubChem PUG REST API URL (CSV output requested)
url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/assay/aid/{assay_id}/CSV"
print(f"AID {assay_id}의 assay 데이터를 다운로드 중입니다...")

# Download the data. The timeout keeps the cell from hanging forever on a
# stalled connection, and raise_for_status() stops here on an HTTP error
# instead of saving an error page that would later fail to parse as CSV.
response = requests.get(url, timeout=60)
response.raise_for_status()

# Write the raw response bytes so the saved file is exactly what the server
# sent (no dependence on requests' charset guess or newline translation).
csv_filename = f"assay_{assay_id}.csv"
with open(csv_filename, "wb") as file:
    file.write(response.content)

# Load the CSV with pandas and preview it (last expression is displayed by
# the notebook).
df = pd.read_csv(csv_filename)
df

AID 1890의 assay 데이터를 다운로드 중입니다...


Unnamed: 0,PUBCHEM_RESULT_TAG,PUBCHEM_SID,PUBCHEM_CID,PUBCHEM_EXT_DATASOURCE_SMILES,PUBCHEM_ACTIVITY_OUTCOME,PUBCHEM_ACTIVITY_SCORE,PUBCHEM_ACTIVITY_URL,PUBCHEM_ASSAYDATA_COMMENT,Qualifier,IC50,...,Inhibition at 3.0 nM,Inhibition at 9.1 nM,Inhibition at 27.3 nM,Inhibition at 81.8 nM,Inhibition at 245.4 nM,Inhibition at 736.3 nM,Inhibition at 2.2 uM,Inhibition at 6.6 uM,Inhibition at 19.9 uM,Inhibition at 59.6 uM
0,RESULT_TYPE,,,,,,,,STRING,FLOAT,...,FLOAT,FLOAT,FLOAT,FLOAT,FLOAT,FLOAT,FLOAT,FLOAT,FLOAT,FLOAT
1,RESULT_DESCR,,,,,,,,Activity Qualifier identifies if the resultant...,The concentration at which 50 percent of the a...,...,Value of %inhibition at 3.0 nanomolar inhibito...,Value of %inhibition at 9.0 nanomolar inhibito...,Value of %inhibition at 27.3 nanomolar inhibit...,Value of %inhibition at 81.8 nanomolar inhibit...,Value of %inhibition at 245 nanomolar inhibito...,Value of %inhibition at 736 nanomolar inhibito...,Value of %inhibition at 2.2 micromolar inhibit...,Value of %inhibition at 6.6 micromolar inhibit...,Value of %inhibition at 19.9 micromolar inhibi...,Value of %inhibition at 59.6 micromolar inhibi...
2,RESULT_UNIT,,,,,,,,,MICROMOLAR,...,PERCENT,PERCENT,PERCENT,PERCENT,PERCENT,PERCENT,PERCENT,PERCENT,PERCENT,PERCENT
3,RESULT_IS_ACTIVE_CONCENTRATION,,,,,,,,,TRUE,...,,,,,,,,,,
4,RESULT_ATTR_CONC_MICROMOL,,,,,,,,,,...,0.003,0.009,0.0273,0.0818,0.245,0.736,2.2,6.6,19.9,59.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101,97,47202254.0,2966254.0,C1=CC=C2C(=C1)C(=C(N2)C(=O)O)C(C3=C(C=CC(=C3)B...,Inactive,0.0,,,>,59.64,...,-0.9,0.9,-0.5,1.3,1.9,2.5,4.5,14.3,33.6,42.9
102,98,845167.0,647501.0,CCN1C2=NC(=O)N(C(=O)C2=NC(=N1)C3=CC=CC=C3)C,Inactive,0.0,,,>,59.64,...,-1.1,-1.7,-1.1,-0.2,1.1,-192.9,-227.1,-47.1,28.9,48.1
103,99,3711170.0,2997527.0,CC1=CC(=NO1)NC(=O)COC(=O)C(C(C)C)NC2=C(C=C(C=C...,Inactive,0.0,,,>,59.64,...,-1.2,-2.3,-1.9,-1.5,0.4,0,3.4,11.5,22,29.7
104,100,14740696.0,1120962.0,COC1=CC=C(C=C1)C(=O)OC2=CN=CC=C2,Inactive,0.0,,,>,59.64,...,-2.2,-2.4,-3.1,-0.5,-0.7,-2.9,-2.5,-1.7,-1.7,0.1


In [None]:
# Show only the structure (SMILES) and activity-outcome columns of the
# downloaded assay table.
preview_columns = ['PUBCHEM_EXT_DATASOURCE_SMILES', 'PUBCHEM_ACTIVITY_OUTCOME']
df.loc[:, preview_columns]

Unnamed: 0,PUBCHEM_EXT_DATASOURCE_SMILES,PUBCHEM_ACTIVITY_OUTCOME
0,,
1,,
2,,
3,,
4,,
...,...,...
101,C1=CC=C2C(=C1)C(=C(N2)C(=O)O)C(C3=C(C=CC(=C3)B...,Inactive
102,CCN1C2=NC(=O)N(C(=O)C2=NC(=N1)C3=CC=CC=C3)C,Inactive
103,CC1=CC(=NO1)NC(=O)COC(=O)C(C(C)C)NC2=C(C=C(C=C...,Inactive
104,COC1=CC=C(C=C1)C(=O)OC2=CN=CC=C2,Inactive


In [None]:
import pandas as pd

# Build a two-column (smiles, label) dataset from the assay table.
# Rows with a missing value in either column are dropped; this also removes
# the leading metadata rows (RESULT_TYPE, RESULT_DESCR, ...) of the CSV,
# which are empty in these two columns.
df_subset = df[['PUBCHEM_EXT_DATASOURCE_SMILES', 'PUBCHEM_ACTIVITY_OUTCOME']].dropna().reset_index(drop=True)

# Rename the columns to 'smiles' and 'label'
df_subset = df_subset.rename(columns={
    'PUBCHEM_EXT_DATASOURCE_SMILES': 'smiles',
    'PUBCHEM_ACTIVITY_OUTCOME': 'label'
})

# Convert label values: active -> 1, inactive -> 0 (case-insensitive)
df_subset['label'] = df_subset['label'].str.lower().map({'active': 1, 'inactive': 0})

# Any outcome other than active/inactive is mapped to NaN, which would also
# silently turn the column into floats. Drop such rows and keep the labels
# as integers so the saved file holds only clean 0/1 values.
df_subset = df_subset.dropna(subset=['label']).reset_index(drop=True)
df_subset['label'] = df_subset['label'].astype(int)

# Save the result (the index is written as the first column), then preview it
df_subset.to_csv('3clpro.csv')
df_subset

Unnamed: 0,smiles,label
0,COC1=CC=C(C=C1)C(=O)N2C3=CC4=C(C=C3N=C2SC)OCCO4,1
1,COC1=C(C=C(C=C1)C2CC(=NN2C(=O)CCl)C3=CC=CS3)OC,1
2,CCC(C)C1=CC=C(C=C1)N(C(C2=CN=CC=C2)C(=O)NC3CCC...,1
3,COC1=C(C=CC(=C1)C2CC(=NN2C(=O)CCl)C3=CC=CS3)OC...,1
4,CCC1=CC(=O)OC2=C1C=CC(=C2)OC(=O)C3=CC=CO3,1
...,...,...
96,C1=CC=C2C(=C1)C(=C(N2)C(=O)O)C(C3=C(C=CC(=C3)B...,0
97,CCN1C2=NC(=O)N(C(=O)C2=NC(=N1)C3=CC=CC=C3)C,0
98,CC1=CC(=NO1)NC(=O)COC(=O)C(C(C)C)NC2=C(C=C(C=C...,0
99,COC1=CC=C(C=C1)C(=O)OC2=CN=CC=C2,0
