# 실습 2: 항암제 반응성 데이터로, 신규 약물에 대한 반응 예측하기


## 참고자료
- DrugBank (https://www.drugbank.ca/)
- RDKit 문서 (https://www.rdkit.org/docs/)
- Scikit-learn 문서 (https://scikit-learn.org/)

In [None]:
!pip install rdkit

Collecting rdkit
  Downloading rdkit-2024.3.6-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (4.0 kB)
Downloading rdkit-2024.3.6-cp310-cp310-manylinux_2_28_x86_64.whl (32.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m32.8/32.8 MB[0m [31m48.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdkit
Successfully installed rdkit-2024.3.6


In [None]:
# 라이브러리 임포트
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Draw
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
import seaborn as sns

# 경고 메시지 숨기기
import warnings
warnings.filterwarnings('ignore')

# 랜덤 시드 설정
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# GPU 사용 가능 여부 확인
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

Using device: cpu


In [None]:
# RNA-seq 데이터 다운로드
!wget https://discover.nci.nih.gov/cellminer/download/processeddataset/nci60_RNA__RNA_seq_composite_expression.zip
!wget https://discover.nci.nih.gov/cellminer/download/processeddataset/DTP_NCI60_ZSCORE.zip

import zipfile
# 압축 해제
with zipfile.ZipFile('nci60_RNA__RNA_seq_composite_expression.zip', 'r') as zip_ref:
    zip_ref.extractall('data')
with zipfile.ZipFile('DTP_NCI60_ZSCORE.zip', 'r') as zip_ref:
    zip_ref.extractall('data')

--2024-12-07 06:48:33--  https://discover.nci.nih.gov/cellminer/download/processeddataset/nci60_RNA__RNA_seq_composite_expression.zip
Resolving discover.nci.nih.gov (discover.nci.nih.gov)... 129.43.255.113, 2607:f220:41d:21c1::812b:ff71
Connecting to discover.nci.nih.gov (discover.nci.nih.gov)|129.43.255.113|:443... connected.
HTTP request sent, awaiting response... 200 200
Length: unspecified [application/zip]
Saving to: ‘nci60_RNA__RNA_seq_composite_expression.zip’

nci60_RNA__RNA_seq_     [              <=>   ]  11.66M  4.20MB/s    in 2.8s    

2024-12-07 06:48:36 (4.20 MB/s) - ‘nci60_RNA__RNA_seq_composite_expression.zip’ saved [12227544]

--2024-12-07 06:48:36--  https://discover.nci.nih.gov/cellminer/download/processeddataset/DTP_NCI60_ZSCORE.zip
Resolving discover.nci.nih.gov (discover.nci.nih.gov)... 129.43.255.113, 2607:f220:41d:21c1::812b:ff71
Connecting to discover.nci.nih.gov (discover.nci.nih.gov)|129.43.255.113|:443... connected.
HTTP request sent, awaiting response... 20

In [None]:
import pandas as pd
# RNA-seq 데이터 로드, 상위 10개 행 건너뛰기
rna_df = pd.read_excel('./data/output/RNA__RNA_seq_composite_expression.xls', skiprows=10)

# 약물 반응 데이터 로드, 상위 8개 행 건너뛰기
drug_df = pd.read_excel('./data/output/DTP_NCI60_ZSCORE.xlsx', skiprows=8)

In [None]:
rna_df

Unnamed: 0,Gene name d,Entrez gene id e,Chromosome f,Start f,End f,Cytoband f,BR:MCF7,BR:MDA-MB-231,BR:HS 578T,BR:BT-549,...,PR:PC-3,PR:DU-145,RE:786-0,RE:A498,RE:ACHN,RE:CAKI-1,RE:RXF 393,RE:SN12C,RE:TK-10,RE:UO-31
0,CH17-408M7.1,102724558,1,-1,-1,1q21.1,0.000,0.000,0.112,0.000,...,0.000,0.000,0.000,0.052,0.000,0.000,0.000,0.000,0.000,0.120
1,DDX11L1,100287102,1,11873,14409,1p36.33,0.199,0.000,0.381,0.000,...,0.110,0.131,0.057,0.000,0.164,0.136,0.184,1.092,0.176,0.000
2,WASH7P,653635,1,14361,29370,1p36.33,3.088,0.566,1.771,2.129,...,2.664,1.997,2.564,1.433,2.846,1.899,1.950,2.267,2.695,2.191
3,FAM138A,645520,1,34610,36081,1p36.33,0.000,0.000,0.157,0.000,...,0.222,0.000,0.000,0.076,0.221,0.000,0.000,0.000,0.000,0.000
4,OR4F5,79501,1,69090,70008,1p36.33,0.000,0.000,0.147,0.000,...,0.000,0.000,0.000,0.000,0.263,0.000,0.000,0.000,0.000,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23803,MAFIP,727764,Un,53588,115073,-,2.985,0.443,0.684,1.766,...,1.828,1.092,0.000,2.364,0.670,1.697,1.410,1.997,1.896,2.932
23804,LOC283788,283788,Un,56348,99642,-,1.320,0.273,1.166,0.744,...,0.801,0.593,1.459,1.462,1.980,1.084,1.031,0.548,1.124,1.378
23805,KIR2DL5B,553128,Un,86745,96246,19p13.3,0.000,0.000,0.000,0.000,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000
23806,LOC100288966,100288966,Un,108006,139339,-,0.000,0.000,0.000,0.000,...,0.000,0.130,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000


In [None]:
# 양수(positive z-score): 해당 세포주가 그 약물에 평균보다 더 민감함(더 잘 죽음)
# 음수(negative z-score): 해당 세포주가 그 약물에 평균보다 더 저항성이 있음(덜 죽음)
# 0에 가까운 값: 평균적인 반응
drug_df

Unnamed: 0,NSC # b,Drug name,FDA status,Mechanism of action c,PubChem SID,SMILES d,BR:MCF7,BR:MDA-MB-231,BR:HS 578T,BR:BT-549,...,RE:786-0,RE:A498,RE:ACHN,RE:CAKI-1,RE:RXF 393,RE:SN12C,RE:TK-10,RE:UO-31,Total experiments e,Total after quality control f
0,1,tolylquinone,-,-,-,CC1=CC(=O)C=CC1=O,-0.27,-0.3,-0.82,-0.23,...,-0.52,-1.65,1.66,-0.27,0,-0.39,-0.38,1.06,3,2
1,17,4-AMINO-3-PENTADECYLPHENOL,-,-,219123,CCCCCCCCCCCCCCCc1cc(O)ccc1N,-0.35,-0.3,-0.22,1.48,...,0.52,-0.33,-0.94,0.72,-0.24,-0.32,-0.76,1.12,3,3
2,89,(dimethylamino)propiophenone hydrochloride,-,-,-,CN(C)CCC(=O)c1ccccc1,na,na,na,na,...,0.37,-0.66,-0.14,0.74,-0.44,-0.18,-0.1,-0.12,3,2
3,185,Cactinomycin,-,-,-,C[C@H]1C[C@H](C)C(=O)[C@@H](C1)[C@H](O)CC2CC(=...,na,na,na,na,...,-0.06,na,na,1.68,0.92,1.39,-0.9,0.92,2,1
4,295,2-Phenylbutyric Acid,-,-,4775,OC(=O)CCCc1ccccc1,-0.26,-0.26,-0.26,-0.26,...,4.82,-0.26,-0.26,-0.26,-0.26,-0.26,-0.26,-0.26,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25288,900911,c((L-Pro)-Xxx-Xxx-Xxx-(D-Asp)-) where Xxx = D-...,-,-,-,OC(=O)CC1N[R][R][R]C(=O)C2CCCN2C1=O,-0.17,-0.17,-0.17,-0.17,...,-0.17,-0.17,-0.17,-0.17,1.59,-0.17,-0.17,-0.17,1,1
25289,900922,c((D-Leu)-Xxx-Xxx-Xxx-(D-Asp)-) where Xxx = D-...,-,-,-,OC(=O)CCC1NC(=O)C(CC(=O)O)N[R][R][R]C1=O,-0.17,-0.17,-0.17,-0.17,...,-0.17,-0.17,-0.17,-0.17,-0.17,-0.17,-0.17,-0.17,1,1
25290,900964,"c(Xxx-Pro.psi.(CH2S)Gly-Xxx-Xxx-Asn), where Xx...",-,-,-,NC(=O)C1N[R][R]C(=O)CSCC2CCCN2[R]C1=O,-0.16,-0.16,-0.16,-0.16,...,-0.16,-0.16,-0.16,-0.16,-0.16,-0.16,-0.16,-0.16,1,1
25291,900974,"c(Xxx-Xxx-Pro.psi.(CH2S)Gly-Xxx-Xxx-Xxx-Asn), ...",-,-,-,NC(=O)C1N[R][R][R]C(=O)CSCC2CCCN2[R][R]C1=O,-0.13,-0.13,-0.13,na,...,-0.13,-0.13,-0.13,-0.13,na,-0.13,-0.13,-0.13,1,1


In [None]:
# drug_df의 "na" 문자열을 0으로 치환
drug_df.replace('na', 0, inplace=True)
drug_df


Unnamed: 0,NSC # b,Drug name,FDA status,Mechanism of action c,PubChem SID,SMILES d,BR:MCF7,BR:MDA-MB-231,BR:HS 578T,BR:BT-549,...,RE:786-0,RE:A498,RE:ACHN,RE:CAKI-1,RE:RXF 393,RE:SN12C,RE:TK-10,RE:UO-31,Total experiments e,Total after quality control f
0,1,tolylquinone,-,-,-,CC1=CC(=O)C=CC1=O,-0.27,-0.3,-0.82,-0.23,...,-0.52,-1.65,1.66,-0.27,0,-0.39,-0.38,1.06,3,2
1,17,4-AMINO-3-PENTADECYLPHENOL,-,-,219123,CCCCCCCCCCCCCCCc1cc(O)ccc1N,-0.35,-0.3,-0.22,1.48,...,0.52,-0.33,-0.94,0.72,-0.24,-0.32,-0.76,1.12,3,3
2,89,(dimethylamino)propiophenone hydrochloride,-,-,-,CN(C)CCC(=O)c1ccccc1,0,0,0,0,...,0.37,-0.66,-0.14,0.74,-0.44,-0.18,-0.1,-0.12,3,2
3,185,Cactinomycin,-,-,-,C[C@H]1C[C@H](C)C(=O)[C@@H](C1)[C@H](O)CC2CC(=...,0,0,0,0,...,-0.06,0,0,1.68,0.92,1.39,-0.9,0.92,2,1
4,295,2-Phenylbutyric Acid,-,-,4775,OC(=O)CCCc1ccccc1,-0.26,-0.26,-0.26,-0.26,...,4.82,-0.26,-0.26,-0.26,-0.26,-0.26,-0.26,-0.26,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25288,900911,c((L-Pro)-Xxx-Xxx-Xxx-(D-Asp)-) where Xxx = D-...,-,-,-,OC(=O)CC1N[R][R][R]C(=O)C2CCCN2C1=O,-0.17,-0.17,-0.17,-0.17,...,-0.17,-0.17,-0.17,-0.17,1.59,-0.17,-0.17,-0.17,1,1
25289,900922,c((D-Leu)-Xxx-Xxx-Xxx-(D-Asp)-) where Xxx = D-...,-,-,-,OC(=O)CCC1NC(=O)C(CC(=O)O)N[R][R][R]C1=O,-0.17,-0.17,-0.17,-0.17,...,-0.17,-0.17,-0.17,-0.17,-0.17,-0.17,-0.17,-0.17,1,1
25290,900964,"c(Xxx-Pro.psi.(CH2S)Gly-Xxx-Xxx-Asn), where Xx...",-,-,-,NC(=O)C1N[R][R]C(=O)CSCC2CCCN2[R]C1=O,-0.16,-0.16,-0.16,-0.16,...,-0.16,-0.16,-0.16,-0.16,-0.16,-0.16,-0.16,-0.16,1,1
25291,900974,"c(Xxx-Xxx-Pro.psi.(CH2S)Gly-Xxx-Xxx-Xxx-Asn), ...",-,-,-,NC(=O)C1N[R][R][R]C(=O)CSCC2CCCN2[R][R]C1=O,-0.13,-0.13,-0.13,0,...,-0.13,-0.13,-0.13,-0.13,0,-0.13,-0.13,-0.13,1,1


In [None]:
# drug_df에서 1000개 약물 랜덤 선택(random_state=42)
drug_df = drug_df.sample(n=1000, random_state=42)
drug_df

Unnamed: 0,NSC # b,Drug name,FDA status,Mechanism of action c,PubChem SID,SMILES d,BR:MCF7,BR:MDA-MB-231,BR:HS 578T,BR:BT-549,...,RE:786-0,RE:A498,RE:ACHN,RE:CAKI-1,RE:RXF 393,RE:SN12C,RE:TK-10,RE:UO-31,Total experiments e,Total after quality control f
15925,710861,(2R) Ethyl 3-(2-hydroxypropyl)amino-2-(1H-benz...,-,-,5471366,CCOC(=O)\C(=C/NC[C@@H](C)O)\c1nc2ccccc2[nH]1,-0.17,0,0,-0.5,...,-0.5,-0.1,0.25,-0.5,-0.5,-0.45,-0.09,3.27,1,1
23579,800842,Perifosine,-,-,-,CCCCCCCCCCCCCCCCCCOP(=O)(O)OC1CC[N+](C)(C)CC1,-1.09,-0.2,-1.09,-0.42,...,0.76,0.86,-0.54,0.85,1.06,0.06,2.47,-0.06,1,1
19912,749673,"N1-(2,15-dioxo-1-(pyren-1-yl)-6,9,12-trioxa-3,...",-,-,54613693,CN(CCCNC(=O)CCOCCOCCOCCNC(=O)Cc1ccc2ccc3cccc4c...,0.98,0.2,-0.61,-1.55,...,-0.15,-1.06,-0.42,-0.73,0.27,-0.92,-1.56,-1.23,2,2
21586,767416,-,-,PK:Not Available,-,CN1C(=O)C(=Cc2cnc(NC3CCN(CC3)C(=O)C)nc12)Oc4cc...,-0.2,-0.2,3.59,-0.2,...,-0.2,0.49,-0.2,-0.2,-0.2,-0.2,-0.2,-0.2,2,1
7217,641609,-,-,-,-,Cc1ccc(cc1)S(=O)(=O)N\N=C(/C(O)c2ccc(cc2)C(O)\...,0,0,0,0,...,-0.98,-0.4,0.39,0,-0.98,0,-0.98,0.07,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14493,699925,"9-((2-Chloroethyl)thio)-2,7-dimethoxyacridine",-,-,395389,COc1ccc2nc3ccc(OC)cc3c(SCCCl)c2c1,1.38,-0.58,0.77,-0.84,...,0.72,-0.7,0.01,1.65,0.12,0.62,-1.05,-1.17,3,2
4203,382770,(E)-4-Bromo-1-(4-methoxyphenyl)-4-methylpent-1...,-,-,5798653,COc1ccc(\C=C\C(=O)C(C)(C)Br)cc1,0.51,-0.99,-0.76,1.57,...,0.92,-0.99,-0.06,-0.83,1.92,0.54,-0.64,0.7,1,1
15726,709568,N-(3-Fluorophenyl)-2-(3-fluorophenyl)imino-7-h...,-,-,135426697,Oc1ccc2C=C(C(=O)Nc3cccc(F)c3)\C(=N\c4cccc(F)c4...,0.59,0,0.75,0.26,...,-0.28,-1.86,0.25,-1.01,0,0.03,0,0.37,1,1
11406,676870,(2E)-1-(Benzenesulfonyl)-2-[(4-nitrophenyl)met...,-,-,5468824,[O-][N+](=O)c1ccc(\C=C/2\N(c3ncccc3C2=O)S(=O)(...,-0.55,-0.16,-0.76,-0.28,...,-0.18,0,-0.15,0.46,-0.18,-0.2,-0.24,-0.32,1,1


In [None]:
# rna_df에서 변동성이 큰 유전자 전처리 (상위 1000개))
def preprocess_rna_data(rna_df, n_genes=1000):
    # 필요한 컬럼만 선택 (Gene name d를 인덱스로, 세포주 발현량 데이터만 선택)
    # BR:MCF7, BR:MDA-MB-231 등의 세포주 컬럼만 선택 (Cytoband 이후 컬럼들)
    rna_processed = rna_df.copy()
    gene_names = rna_processed['Gene name d']
    expression_data = rna_processed.iloc[:, 7:]  # Cytoband 이후의 모든 컬럼

    # 데이터프레임 재구성
    expression_data.index = gene_names

    # 숫자로 변환 가능한 데이터만 선택하여 표준편차 계산
    numeric_data = expression_data.apply(pd.to_numeric, errors='coerce')
    gene_std = numeric_data.std(axis=1)

    # 결측치 제거
    gene_std = gene_std.dropna()

    # 표준편차가 큰 상위 n_genes개 유전자 선택
    top_variable_genes = gene_std.nlargest(n_genes).index
    rna_filtered = numeric_data.loc[top_variable_genes]

    print(f"원본 유전자 수: {len(rna_df)}")
    print(f"필터링된 유전자 수: {len(rna_filtered)}")

    return rna_filtered

# 전처리 실행
rna_processed = preprocess_rna_data(rna_df)

# 결과 확인
print("\n처리된 데이터 형태:", rna_processed.shape)
print("\n처리된 데이터 샘플:")
rna_processed

원본 유전자 수: 23808
필터링된 유전자 수: 1000

처리된 데이터 형태: (1000, 59)

처리된 데이터 샘플:


Unnamed: 0_level_0,BR:MDA-MB-231,BR:HS 578T,BR:BT-549,BR:T-47D,CNS:SF-268,CNS:SF-295,CNS:SF-539,CNS:SNB-19,CNS:SNB-75,CNS:U251,...,PR:PC-3,PR:DU-145,RE:786-0,RE:A498,RE:ACHN,RE:CAKI-1,RE:RXF 393,RE:SN12C,RE:TK-10,RE:UO-31
Gene name d,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
MIR5047,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,14.421,15.832,...,0.000,0.000,0.000,0.000,15.010,0.000,0.000,0.000,0.000,0.000
SNORD104,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000
KRT8,2.982,2.232,0.535,10.277,2.938,1.814,1.921,1.214,0.234,6.490,...,6.955,7.118,7.261,6.823,8.383,7.504,1.865,8.338,6.570,9.417
MIR142,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000
UCHL1,0.000,0.952,6.733,1.278,6.276,8.970,2.407,8.030,4.228,8.771,...,7.127,7.232,5.711,5.400,0.000,7.736,5.402,8.252,7.132,7.810
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TST,2.996,2.201,4.652,3.934,2.669,4.214,2.395,2.964,3.045,3.440,...,3.173,2.773,2.957,2.579,4.053,1.833,2.196,0.000,2.410,2.226
MGST2,4.214,0.333,0.000,2.569,0.986,0.685,1.229,2.545,1.040,3.331,...,2.531,1.071,0.739,3.050,2.506,2.263,2.004,0.310,1.230,3.092
PI3,0.000,0.000,0.000,0.000,0.238,0.333,0.205,0.000,0.294,0.258,...,8.129,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000
ENHO,0.000,1.918,1.358,0.425,2.213,1.268,0.111,2.623,1.398,1.409,...,0.507,1.483,3.471,0.843,0.287,1.905,0.000,0.306,0.626,0.469


In [None]:
import numpy as np
# drug_df에서 "SMILES d" 컬럼 선택하고 ECFP로 인코딩
def smiles_to_ecfp(smiles, radius=2, nBits=1024):
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is not None:
            return list(AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits))
        return [0] * nBits
    except:
        return [0] * nBits

# drug_df에서 SMILES 컬럼 선택
smiles_data = drug_df['SMILES d'].fillna('')
# 각 SMILES 문자열을 ECFP로 변환
ecfp_features = np.array([smiles_to_ecfp(s) for s in smiles_data])
print("ECFP 특성 행렬 형태:", ecfp_features.shape)

[06:54:25] Explicit valence for atom # 0 Br, 2, is greater than permitted
[06:54:25] SMILES Parse Error: syntax error while parsing: -
[06:54:25] SMILES Parse Error: Failed parsing SMILES '-' for input: '-'
[06:54:25] SMILES Parse Error: syntax error while parsing: -
[06:54:25] SMILES Parse Error: Failed parsing SMILES '-' for input: '-'
[06:54:25] SMILES Parse Error: syntax error while parsing: -
[06:54:25] SMILES Parse Error: Failed parsing SMILES '-' for input: '-'
[06:54:25] Explicit valence for atom # 0 Cl, 2, is greater than permitted
[06:54:25] Explicit valence for atom # 0 Cl, 2, is greater than permitted
[06:54:25] Explicit valence for atom # 0 Cl, 2, is greater than permitted
[06:54:26] Explicit valence for atom # 0 Cl, 2, is greater than permitted
[06:54:26] Explicit valence for atom # 3 Cl, 3, is greater than permitted
[06:54:26] Explicit valence for atom # 14 N, 6, is greater than permitted
[06:54:26] Explicit valence for atom # 6 Cl, 3, is greater than permitted


ECFP 특성 행렬 형태: (1000, 1024)


[06:54:26] SMILES Parse Error: syntax error while parsing: -
[06:54:26] SMILES Parse Error: Failed parsing SMILES '-' for input: '-'
[06:54:26] Explicit valence for atom # 7 O, 3, is greater than permitted
[06:54:26] Explicit valence for atom # 24 Cl, 3, is greater than permitted
[06:54:26] Explicit valence for atom # 0 Cl, 2, is greater than permitted
[06:54:26] Explicit valence for atom # 10 B, 7, is greater than permitted
[06:54:26] SMILES Parse Error: syntax error while parsing: -
[06:54:26] SMILES Parse Error: Failed parsing SMILES '-' for input: '-'
[06:54:26] Explicit valence for atom # 7 Ga, 9, is greater than permitted


In [None]:
# RNA 데이터 전치 (세포주가 행으로 오도록)
rna_t = rna_processed.T

# 각 약물-세포주 쌍에 대한 특성 벡터와 반응값 생성
X_list = []
y_list = []

# drug_df에서 세포주 반응값 컬럼만 선택 (SMILES d 이후 컬럼들)
response_cols = drug_df.columns[6:]

# 각 약물에 대해
for drug_idx in range(len(drug_df)):
    drug_ecfp = ecfp_features[drug_idx]
    drug_responses = drug_df.iloc[drug_idx, 6:].values  # SMILES d 이후의 반응값

    # 각 세포주에 대해
    for cell_idx, cell_line in enumerate(rna_t.index):
        # 세포주의 유전자 발현값
        gene_expr = rna_t.iloc[cell_idx].values

        # 약물 ECFP와 유전자 발현을 결합
        combined_features = np.concatenate([drug_ecfp, gene_expr])
        X_list.append(combined_features)

        # 해당 약물-세포주 쌍의 반응값
        y_list.append(drug_responses[cell_idx])

# 리스트를 numpy 배열로 변환
X = np.array(X_list)
y = np.array(y_list)

# 결과 출력
print("X shape:", X.shape)  # (약물 수 * 세포주 수, ECFP 차원 + 유전자 수)
print("y shape:", y.shape)  # (약물 수 * 세포주 수,)


X shape: (59000, 2024)
y shape: (59000,)


In [None]:
# 학습 데이터와 테스트 데이터 분리
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((47200, 2024), (11800, 2024), (47200,), (11800,))

In [None]:
# 모델 훈련
import lightgbm as lgb
model = lgb.LGBMRegressor(
    n_estimators=100,
    random_state=42,
    n_jobs=-1  # 모든 CPU 코어 사용
)
model.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 2.097016 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 52800
[LightGBM] [Info] Number of data points in the train set: 47200, number of used features: 2024
[LightGBM] [Info] Start training from score 0.000161


In [None]:
# 예측
y_pred = model.predict(X_test)
# y_test float으로 변환
y_test = y_test.astype(float)

# 평가
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse:.4f}')

Mean Squared Error: 0.7398
