In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
from math import sqrt
import os

# Libraries needed to compute the molecular properties
from rdkit import Chem
from rdkit.Chem import Descriptors, Crippen, Lipinski, rdMolDescriptors, Fragments

In [None]:
class SMILESDataset(Dataset):
    """Thin Dataset wrapper that serves each stored item as a float32 tensor."""

    def __init__(self, data):
        # Any indexable collection supporting len() works here.
        self.data = data

    def __len__(self):
        """Number of items in the wrapped collection."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return item `idx` converted to a new float32 tensor."""
        sample = self.data[idx]
        return torch.tensor(sample, dtype=torch.float32)

In [None]:
class Molecular_Properties_Dataset(Dataset):
    """Dataset of RDKit descriptor vectors computed from SMILES strings.

    Each item is a float32 tensor holding the 18 descriptors produced by
    `calculate_properties`, in that method's order.
    """

    def __init__(self, csv_file):
        """Build the descriptor table.

        Args:
            csv_file: Path (or file-like object) of a CSV file with a 'Smiles'
                column. A list/tuple/ndarray/Series of SMILES strings is also
                accepted and used directly.

        Raises:
            ValueError: If any SMILES string cannot be parsed by RDKit.
        """
        if isinstance(csv_file, (list, tuple, np.ndarray, pd.Series)):
            # Already-loaded SMILES: wrap them so `self.df` keeps the same schema.
            self.df = pd.DataFrame({'Smiles': list(csv_file)})
        else:
            self.df = pd.read_csv(csv_file)
        self.smiles_data = self.df['Smiles'].tolist()
        self.properties_features = np.array(
            [self.calculate_properties(smiles) for smiles in self.smiles_data]
        )

    def __len__(self):
        """Number of molecules in the dataset."""
        # The original read `self.features`, an attribute that is never set.
        return len(self.properties_features)

    def __getitem__(self, idx):
        """Return the descriptor vector of molecule `idx` as a float32 tensor."""
        # float32 matches the default dtype of nn.Linear weights; the NumPy
        # array is float64 and would otherwise cause a dtype mismatch.
        return torch.tensor(self.properties_features[idx], dtype=torch.float32)

    @staticmethod
    def calculate_properties(smiles):
        """Compute the descriptor list for one SMILES string.

        Args:
            smiles: SMILES representation of a molecule.

        Returns:
            list: 18 numeric descriptors, each rounded to 6 decimals.

        Raises:
            ValueError: If RDKit cannot parse `smiles`.
        """
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # MolFromSmiles signals a parse failure by returning None.
            raise ValueError(f'Could not parse SMILES string: {smiles!r}')

        # Order matters: it defines the feature layout of every item.
        descriptor_functions = (
            Descriptors.MolWt,                          # molecular weight
            Crippen.MolLogP,                            # Crippen LogP
            Descriptors.TPSA,                           # topological polar surface area
            Lipinski.NumHAcceptors,                     # hydrogen-bond acceptors
            Lipinski.NumHDonors,                        # hydrogen-bond donors
            Lipinski.NumRotatableBonds,                 # rotatable bonds
            Chem.GetFormalCharge,                       # formal charge of the molecule
            rdMolDescriptors.CalcNumAtomStereoCenters,  # atom stereocenters
            rdMolDescriptors.CalcFractionCSP3,          # fraction of sp3 carbons
            Descriptors.NumAliphaticCarbocycles,        # aliphatic carbocycles
            Descriptors.NumAromaticRings,               # aromatic rings
            Descriptors.NumHeteroatoms,                 # heteroatoms
            Fragments.fr_COO,                           # carboxylic acid groups
            Fragments.fr_Al_OH,                         # aliphatic alcohol groups
            Fragments.fr_alkyl_halide,                  # alkyl halide groups
            Descriptors.NumAromaticCarbocycles,         # aromatic carbocycles
            Fragments.fr_piperdine,                     # piperidine groups (RDKit's own spelling)
            Fragments.fr_methoxy,                       # methoxy groups
        )
        return [round(descriptor(mol), 6) for descriptor in descriptor_functions]

In [None]:
class Molecular_Properties_Model(nn.Module):
    """MLP mapping input_dim -> 128 -> 64 -> 32 -> output_dim.

    Every Linear layer is followed by BatchNorm1d; the three hidden stages also
    apply ReLU, while the output stage ends with the batch norm.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()

        # Hidden stage 1: input_dim -> 128
        self.fc1, self.bn1, self.relu1 = nn.Linear(input_dim, 128), nn.BatchNorm1d(128), nn.ReLU()

        # Hidden stage 2: 128 -> 64
        self.fc2, self.bn2, self.relu2 = nn.Linear(128, 64), nn.BatchNorm1d(64), nn.ReLU()

        # Hidden stage 3: 64 -> 32
        self.fc3, self.bn3, self.relu3 = nn.Linear(64, 32), nn.BatchNorm1d(32), nn.ReLU()

        # Output stage: 32 -> output_dim, batch-normalised, no activation
        self.fc4, self.bn4 = nn.Linear(32, output_dim), nn.BatchNorm1d(output_dim)

    def forward(self, x):
        """Run `x` through the three hidden stages and the normalised output stage."""
        hidden_stages = (
            (self.fc1, self.bn1, self.relu1),
            (self.fc2, self.bn2, self.relu2),
            (self.fc3, self.bn3, self.relu3),
        )
        for linear, norm, activation in hidden_stages:
            x = activation(norm(linear(x)))
        return self.bn4(self.fc4(x))

In [None]:
class RegressionModel(nn.Module):
    """Fully connected regressor: input_dim -> 2048 -> 1024 -> 512 -> 256 -> 128 -> output_dim.

    Each hidden stage is Linear -> BatchNorm1d -> ReLU -> Dropout, with the
    dropout probability shrinking from 0.3 to 0.1 as the layers narrow. The
    output layer is a plain Linear.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()

        # Stage 1: widest layer, strongest dropout
        self.fc1, self.bn1 = nn.Linear(input_dim, 2048), nn.BatchNorm1d(2048)
        self.relu1, self.drop1 = nn.ReLU(), nn.Dropout(p=0.3)

        # Stage 2: moderate dropout
        self.fc2, self.bn2 = nn.Linear(2048, 1024), nn.BatchNorm1d(1024)
        self.relu2, self.drop2 = nn.ReLU(), nn.Dropout(p=0.25)

        # Stage 3: moderate dropout
        self.fc3, self.bn3 = nn.Linear(1024, 512), nn.BatchNorm1d(512)
        self.relu3, self.drop3 = nn.ReLU(), nn.Dropout(p=0.2)

        # Stage 4: lower dropout
        self.fc4, self.bn4 = nn.Linear(512, 256), nn.BatchNorm1d(256)
        self.relu4, self.drop4 = nn.ReLU(), nn.Dropout(p=0.15)

        # Stage 5: lowest dropout
        self.fc5, self.bn5 = nn.Linear(256, 128), nn.BatchNorm1d(128)
        self.relu5, self.drop5 = nn.ReLU(), nn.Dropout(p=0.1)

        # Final output layer
        self.fc6 = nn.Linear(128, output_dim)

    def forward(self, x):
        """Apply the five hidden stages in order, then the output layer."""
        stages = (
            (self.fc1, self.bn1, self.relu1, self.drop1),
            (self.fc2, self.bn2, self.relu2, self.drop2),
            (self.fc3, self.bn3, self.relu3, self.drop3),
            (self.fc4, self.bn4, self.relu4, self.drop4),
            (self.fc5, self.bn5, self.relu5, self.drop5),
        )
        for linear, norm, activation, dropout in stages:
            x = dropout(activation(norm(linear(x))))
        return self.fc6(x)

In [None]:
def rmse_loss(output, target):
    """Root-mean-square error between `output` and `target` as a 0-dim tensor.

    The result stays in the autograd graph, so `.backward()` can be called on
    it. `math.sqrt` would return a plain Python float that cannot be
    back-propagated.
    """
    return torch.sqrt(torch.mean((output - target) ** 2))


def train_model(features, target, model, device, num_epochs=10, batch_size=32,
                val_fraction=0.2, lr=0.001):
    """Train `model` on (`features`, `target`) with AdamW and an RMSE loss.

    The samples are split at random into a training and a validation subset.
    After every epoch the sample-weighted mean of the per-batch RMSE values is
    printed for both subsets.

    Args:
        features: Array-like or tensor of shape (num_samples, num_features).
        target: Array-like or tensor with one row per sample; a 1-D target is
            reshaped to (num_samples, 1) so it lines up with the model output
            instead of broadcasting against it.
        model: nn.Module to optimise in place; expected to already live on `device`.
        device: Device each mini-batch is moved to.
        num_epochs: Number of passes over the training subset.
        batch_size: Mini-batch size for both loaders.
        val_fraction: Fraction of samples held out for validation, in [0, 1).
        lr: AdamW learning rate.

    Raises:
        ValueError: If there are no samples or `val_fraction` is outside [0, 1).
    """
    if not 0 <= val_fraction < 1:
        raise ValueError('val_fraction must be in [0, 1)')

    # Keep the full data set on the CPU; only mini-batches are moved to `device`.
    features = torch.as_tensor(features, dtype=torch.float32).detach().cpu()
    num_samples = features.shape[0]
    if num_samples == 0:
        raise ValueError('features must contain at least one sample')
    target = torch.as_tensor(target, dtype=torch.float32).detach().cpu()
    target = target.reshape(num_samples, -1)

    num_val = int(num_samples * val_fraction)
    num_train = num_samples - num_val
    dataset = torch.utils.data.TensorDataset(features, target)
    train_set, val_set = torch.utils.data.random_split(dataset, [num_train, num_val])

    # BatchNorm1d cannot normalise a training batch of one sample, so a
    # trailing batch of size 1 is dropped.
    drop_last = num_train > batch_size and num_train % batch_size == 1
    train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True,
                              drop_last=drop_last)
    val_loader = DataLoader(val_set, batch_size=batch_size, shuffle=False)

    optimizer = optim.AdamW(model.parameters(), lr=lr)

    for epoch in range(num_epochs):
        model.train()
        train_loss, train_seen = 0.0, 0
        for batch_features, batch_targets in train_loader:
            batch_features = batch_features.to(device)
            batch_targets = batch_targets.to(device)

            optimizer.zero_grad()
            outputs = model(batch_features)
            loss = rmse_loss(outputs, batch_targets)
            loss.backward()
            optimizer.step()

            train_loss += loss.item() * batch_features.size(0)
            train_seen += batch_features.size(0)

        # Divide by the samples actually seen (differs from len(train_set)
        # when the last batch was dropped).
        train_loss /= max(train_seen, 1)

        model.eval()
        val_loss, val_seen = 0.0, 0
        with torch.no_grad():
            for batch_features, batch_targets in val_loader:
                batch_features = batch_features.to(device)
                batch_targets = batch_targets.to(device)
                outputs = model(batch_features)
                loss = rmse_loss(outputs, batch_targets)
                val_loss += loss.item() * batch_features.size(0)
                val_seen += batch_features.size(0)

        # With no validation samples there is nothing to report.
        val_loss = val_loss / val_seen if val_seen else float('nan')

        print(f'Epoch {epoch+1}, Training Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}')


In [None]:
# Location of the training CSV, relative to the notebook's working directory.
file_path = 'train_data'
csv_file = os.path.join(file_path, 'train_data.csv')

# Load the CSV file and pull out each column of interest.
df = pd.read_csv(csv_file)

# Convert each column to a plain Python list.
smiles_data = df['Smiles'].tolist()
image_feature_data = df['image_feature_vector'].tolist()
protein_embedding_data = df['target_protein_vector'].tolist()

# Build one dataset per input modality. The wrapper class defined in this
# notebook is SMILESDataset; `CustomDataset` does not exist.
# NOTE(review): SMILESDataset converts every item with
# torch.tensor(..., dtype=torch.float32), which cannot convert raw SMILES
# strings. The vector columns presumably need parsing too if the CSV stores
# them as text. Confirm before iterating these loaders.
smiles_dataset = SMILESDataset(smiles_data)
image_feature_dataset = SMILESDataset(image_feature_data)
protein_embedding_dataset = SMILESDataset(protein_embedding_data)
# Molecular_Properties_Dataset reads the 'Smiles' column from the CSV itself.
molecular_properties_dataset = Molecular_Properties_Dataset(csv_file)

# Create the DataLoaders.
batch_size = 32  # mini-batch size

smiles_loader = DataLoader(smiles_dataset, batch_size=batch_size, shuffle=False, num_workers=2)
image_feature_loader = DataLoader(image_feature_dataset, batch_size=batch_size, shuffle=False, num_workers=2)
protein_embedding_loader = DataLoader(protein_embedding_dataset, batch_size=batch_size, shuffle=False, num_workers=2)
molecular_properties_loader = DataLoader(molecular_properties_dataset, batch_size=batch_size, shuffle=False, num_workers=2)

In [None]:
# NOTE(review): `combined_features` is not defined in any cell visible here.
# It must be built (presumably by combining the per-modality features) before
# this cell can run. Confirm where it comes from.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

regression_model = RegressionModel(input_dim=combined_features.shape[1], output_dim=1).to(device)
# Placeholder random target; the real target data still has to be loaded.
# Shape (num_samples, 1) matches the model's output_dim=1, so the loss does not broadcast.
target = torch.randn(combined_features.shape[0], 1, device=device)

train_model(combined_features, target, regression_model, device)