In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from sklearn.preprocessing import StandardScaler
from rdkit import Chem
from rdkit.Chem import Descriptors, Crippen, Lipinski, rdMolDescriptors, Fragments
import numpy as np
import pandas as pd

In [2]:
# 특성 계산 함수
def calculate_properties(smiles):
    mol = Chem.MolFromSmiles(smiles)
    
    properties = []
    
    properties.append(Descriptors.MolWt(mol))
    properties.append(Crippen.MolLogP(mol))
    properties.append(Descriptors.TPSA(mol))
    properties.append(Lipinski.NumHAcceptors(mol))
    properties.append(Lipinski.NumHDonors(mol))
    properties.append(Lipinski.NumRotatableBonds(mol))
    properties.append(Chem.GetFormalCharge(mol))
    properties.append(rdMolDescriptors.CalcNumAtomStereoCenters(mol))
    properties.append(rdMolDescriptors.CalcFractionCSP3(mol))
    properties.append(Descriptors.NumAliphaticCarbocycles(mol))
    properties.append(Descriptors.NumAromaticRings(mol))
    properties.append(Descriptors.NumHeteroatoms(mol))
    properties.append(Fragments.fr_COO(mol))
    properties.append(Fragments.fr_Al_OH(mol))
    properties.append(Fragments.fr_alkyl_halide(mol))
    properties.append(Descriptors.NumAromaticCarbocycles(mol))
    properties.append(Fragments.fr_piperdine(mol))
    properties.append(Fragments.fr_methoxy(mol))
    
    return properties

In [3]:
# SMILES 데이터를 읽어옵니다.
df = pd.read_csv('../../train_data/dacon_train.csv')
smiles_data = df['Smiles'].dropna()

print(smiles_data)

0        O=C(COc1ccc2c(=O)ccoc2c1)Nc1cccc(NS(=O)(=O)c2c...
1           O=C(Nc1cccc(-c2cnc3cc(-c4nncs4)ccn23)c1)NC1CC1
2        CC(C)Nc1cc(-n2ccc3cc(C#N)cnc32)ncc1C(=O)N[C@H]...
3        O=c1[nH]c([C@@H]2CNC[C@H]2c2ccccc2)nc2ccc(-c3c...
4        COc1cc2c(cnc3[nH]nc(C4CC4)c32)cc1-c1c(F)ccc(NS...
                               ...                        
39157    NC1(c2ccc(-c3nc4n(c3-c3ccccc3)COc3cccc(Cl)c3-4...
39158    CC(C)c1cccc(NC(=O)c2cccc(N3CCc4c(cncc4C(=O)NCC...
39159        CCNC(=O)Nc1nc2ccc(-c3ccccc3Oc3ccccc3)cc2[nH]1
39160    C=C1CNCCc2ccc(Nc3ncc(Cl)c(Nc4ccccc4C(=O)NC)n3)...
39161                O=C(Nc1cc(C2CCCC2)[nH]n1)c1ccc(Br)cc1
Name: Smiles, Length: 39145, dtype: object


In [4]:
features = np.array([calculate_properties(smiles) for smiles in smiles_data])

In [12]:
# 원본 데이터에서 결측치가 있는 행 확인
original_indices = df.index
non_null_indices = df['Smiles'].dropna().index

# 결측치로 인해 제거된 행의 인덱스 확인
removed_indices = original_indices.difference(non_null_indices)

# 제거된 행을 출력
removed_rows = df.loc[removed_indices]
print("제거된 행:")
print(removed_rows)

제거된 행:
      Smiles                  Target Name  Standard Value
940      NaN    Cyclin-dependent kinase 2           200.0
4586     NaN    Cyclin-dependent kinase 9             6.0
7909     NaN  Tyrosine-protein kinase ABL             7.4
9995     NaN    Cyclin-dependent kinase 9             3.0
10072    NaN    Cyclin-dependent kinase 2           320.0
10281    NaN    Cyclin-dependent kinase 9             5.0
10717    NaN    Cyclin-dependent kinase 9             6.0
14793    NaN  Tyrosine-protein kinase ABL             0.7
17148    NaN  Tyrosine-protein kinase ABL           169.0
20471    NaN  Tyrosine-protein kinase ABL             2.0
20702    NaN  Tyrosine-protein kinase ABL             1.7
21655    NaN    Cyclin-dependent kinase 2           170.0
23330    NaN    Cyclin-dependent kinase 9             5.0
25834    NaN  Tyrosine-protein kinase ABL             3.0
27375    NaN    Cyclin-dependent kinase 9            12.0
35827    NaN    Cyclin-dependent kinase 9             6.0
36523  

In [5]:
# 데이터 정규화
scaler = StandardScaler()
features = scaler.fit_transform(features)

In [6]:
# PyTorch Dataset 클래스 정의
class SMILESData(Dataset):
    def __init__(self, features):
        self.features = torch.tensor(features, dtype=torch.float32)
        
    def __len__(self):
        return len(self.features)
    
    def __getitem__(self, idx):
        return self.features[idx]

In [7]:
class AdvancedFNNModel(nn.Module):
    def __init__(self, input_dim):
        super(AdvancedFNNModel, self).__init__()
        
        # 첫 번째 층
        self.fc1 = nn.Linear(input_dim, 128)
        self.bn1 = nn.BatchNorm1d(128)
        self.relu1 = nn.ReLU()
        
        # 두 번째 층
        self.fc2 = nn.Linear(128, 64)
        self.bn2 = nn.BatchNorm1d(64)
        self.relu2 = nn.ReLU()
        
        # 세 번째 층
        self.fc3 = nn.Linear(64, 32)
        self.bn3 = nn.BatchNorm1d(32)
        self.relu3 = nn.ReLU()
        
        # 출력 층
        self.fc4 = nn.Linear(32, input_dim)  # 또는 output_dim으로 설정
        self.bn4 = nn.BatchNorm1d(input_dim)
    
    def forward(self, x):
        x = self.relu1(self.bn1(self.fc1(x)))
        x = self.relu2(self.bn2(self.fc2(x)))
        x = self.relu3(self.bn3(self.fc3(x)))
        x = self.bn4(self.fc4(x))  # 마지막 층에는 활성화 함수가 없을 수도 있습니다
        return x

In [8]:
# SMILESData 객체 생성
dataset = SMILESData(features)

In [9]:
# FNN 모델 객체 생성
input_dim = features.shape[1]
model = AdvancedFNNModel(input_dim)

In [10]:
# DataLoader로 피처 확인
data_loader = DataLoader(dataset, batch_size=1, shuffle=False)

# 피처 출력
for i, data in enumerate(data_loader):
    print(f"Feature {i + 1}: {data.numpy()}")

Feature 1: [[ 0.74267066  0.44299468  0.55779445 -0.17710938 -0.19387761  0.43488035
  -0.01839086 -0.52896667 -1.2336113  -0.4866239   0.5119662   1.1725851
  -0.10744621 -0.3788037   2.2518208   1.5940937  -0.41801056 -0.48841658]]
Feature 2: [[-0.7430053  -0.12266772 -0.37070757 -0.17710938 -0.19387761 -0.61247665
  -0.01839086 -0.52896667 -0.73441267  1.5383462   0.5119662  -0.32234946
  -0.10744621 -0.3788037  -0.44137877 -0.6224734  -0.41801056 -0.48841658]]
Feature 3: [[ 0.55555946  0.27411735  0.51943666  0.795117   -0.19387761  0.08576136
  -0.01839086 -0.52896667  0.6881406   1.5383462   0.5119662   0.42511785
  -0.10744621 -0.3788037   0.45635438 -1.730757   -0.41801056 -0.48841658]]
Feature 4: [[-0.9420522  -0.8207789  -0.30221152 -1.1493359   0.57637906 -0.96159565
  -0.01839086  1.593678   -0.5162754  -0.4866239   0.5119662  -1.0698168
  -0.10744621 -0.3788037  -0.44137877  0.48581013 -0.41801056 -0.48841658]]
Feature 5: [[ 0.62229896  1.0780643   0.01774115 -0.6632226  -

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [11]:
# 피처 벡터의 크기 출력
print(f"Feature size: {features.shape}")

Feature size: (39145, 18)
