In [1]:
import warnings

# 모든 경고 무시
warnings.filterwarnings("ignore")

In [2]:
import os
import numpy as np
import pandas as pd
import deepchem as dc
import torch
from tqdm import tqdm
from tqdm.notebook import tqdm
from deepchem.feat.molecule_featurizers import CircularFingerprint
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import KFold
from sklearn.multioutput import MultiOutputClassifier
from skmultilearn.model_selection import IterativeStratification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, roc_auc_score
from rdkit import Chem
from rdkit.Chem import Descriptors, Crippen, Lipinski, Fragments, rdMolDescriptors

# 로드된 Roberta 모델과 Tokenizer
from transformers import RobertaTokenizer, RobertaModel

No normalization for SPS. Feature removed!
No normalization for AvgIpc. Feature removed!
Skipped loading some Tensorflow models, missing a dependency. No module named 'tensorflow'
Skipped loading modules with pytorch-geometric dependency, missing a dependency. No module named 'torch_geometric'
Skipped loading modules with pytorch-geometric dependency, missing a dependency. cannot import name 'DMPNN' from 'deepchem.models.torch_models' (/HDD1/bbq9088/miniconda3/envs/molberta/lib/python3.10/site-packages/deepchem/models/torch_models/__init__.py)
Skipped loading modules with pytorch-lightning dependency, missing a dependency. No module named 'lightning'
Skipped loading some Jax models, missing a dependency. No module named 'jax'
Skipped loading some PyTorch models, missing a dependency. No module named 'tensorflow'


In [3]:
# GPU 설정
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Device: {device}")

Device: cuda


In [4]:
# 로드한 RoBERTa 모델
tokenizer = RobertaTokenizer.from_pretrained("./origin_model/roberta/tokenizer_folder")
model = RobertaModel.from_pretrained("./origin_model/roberta").to(device)

Some weights of the model checkpoint at ./origin_model/roberta were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
# Featurizer 설정 (CircularFingerprint 사용)
featurizer = CircularFingerprint(radius=2, size=2048)

In [6]:
# 데이터셋 로드 (ClinTox, SIDER, Tox21)
tasks_clintox, datasets_clintox, transformers_clintox = dc.molnet.load_clintox(featurizer=featurizer, splitter=None, transformers=[], reload=True)
dataset_clintox = datasets_clintox[0]
df_clintox = pd.DataFrame({'smiles': dataset_clintox.ids, 'FDA_APPROVED': dataset_clintox.y[:, 0], 'CT_TOX': dataset_clintox.y[:, 1]}).dropna()

tasks_sider, datasets_sider, transformers_sider = dc.molnet.load_sider(featurizer=featurizer, splitter=None, transformers=[], reload=True)
dataset_sider = datasets_sider[0]
df_sider = pd.DataFrame(data=dataset_sider.y, columns=tasks_sider)
df_sider['smiles'] = dataset_sider.ids
df_sider = df_sider.dropna()

tasks_tox21, datasets_tox21, transformers_tox21 = dc.molnet.load_tox21(featurizer=featurizer, splitter=None, transformers=[], reload=True)
dataset_tox21 = datasets_tox21[0]
df_tox21 = pd.DataFrame(data=dataset_tox21.y, columns=tasks_tox21)
df_tox21['smiles'] = dataset_tox21.ids
df_tox21 = df_tox21.dropna()

In [7]:
# SMILES에 대한 입력 문장 생성 함수
#def create_input_text(smiles):
#    return f"SMILES: {smiles}"

In [8]:
# SMILES에 대한 분자 특성 계산 함수
def calculate_properties(smiles):
    mol = Chem.MolFromSmiles(smiles)
    
    properties = []
    try:
        properties.append(Descriptors.MolWt(mol) if mol else None)
        properties.append(Crippen.MolLogP(mol) if mol else None)
        properties.append(Descriptors.TPSA(mol) if mol else None)
        properties.append(Lipinski.NumHAcceptors(mol) if mol else None)
        properties.append(Lipinski.NumHDonors(mol) if mol else None)
        properties.append(Lipinski.NumRotatableBonds(mol) if mol else None)
        properties.append(Chem.GetFormalCharge(mol) if mol else None)
        properties.append(rdMolDescriptors.CalcNumAtomStereoCenters(mol) if mol else None)
        properties.append(rdMolDescriptors.CalcFractionCSP3(mol) if mol else None)
        properties.append(Descriptors.NumAliphaticCarbocycles(mol) if mol else None)
        properties.append(Descriptors.NumAromaticRings(mol) if mol else None)
        properties.append(Descriptors.NumHeteroatoms(mol) if mol else None)
        properties.append(Fragments.fr_COO(mol) if mol else None)
        properties.append(Fragments.fr_Al_OH(mol) if mol else None)
        properties.append(Fragments.fr_alkyl_halide(mol) if mol else None)
        properties.append(Descriptors.NumAromaticCarbocycles(mol) if mol else None)
        properties.append(Fragments.fr_piperdine(mol) if mol else None)
        properties.append(Fragments.fr_methoxy(mol) if mol else None)
    except Exception as e:
        print(f"Warning: Could not calculate properties for SMILES: {smiles}. Error: {e}")
        return [None] * 18  # 오류 발생 시 모든 값을 None으로 반환
    
    return properties

In [9]:
def create_input_text(smiles):
    properties = calculate_properties(smiles)
    
    property_names = [
        "Molecular Weight", "LogP", "Topological Polar Surface Area", 
        "Number of Hydrogen Bond Acceptors", "Number of Hydrogen Bond Donors", 
        "Number of Rotatable Bonds", "Formal Charge", "Number of Atom Stereocenters", 
        "Fraction of sp3 Carbon Atoms", "Number of Aliphatic Carbocycles", 
        "Number of Aromatic Rings", "Number of Heteroatoms", "Number of Carboxylic Acid Groups", 
        "Number of Aliphatic Alcohol Groups", "Number of Alkyl Halide Groups", 
        "Number of Aromatic Carbocycles", "Number of Piperidine Groups", 
        "Number of Methoxy Groups"
    ]
    
    # None 값이 아닌 속성만 포함
    properties_text = " | ".join(
        [f"{name}: {value:.5f}" for name, value in zip(property_names, properties) if value is not None]
    )
    
    input_text = f"SMILES: {smiles} | {properties_text}" if properties_text else f"SMILES: {smiles}"
    return input_text

In [10]:
# 임베딩 추출 함수 (RoBERTa 기반)
def get_embeddings(model, tokenizer, smiles_list):
    embeddings = []

    for smiles in tqdm(smiles_list, desc="Processing SMILES with Roberta"):
        input_text = create_input_text(smiles)
        
        # 입력 텍스트를 토큰화
        inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)

        # 모델에 입력하여 임베딩 추출
        with torch.no_grad():
            outputs = model(**inputs)
        embedding = outputs.last_hidden_state.mean(dim=1).cpu().numpy().flatten()
        embeddings.append(embedding)

    return np.array(embeddings)

In [11]:
# K-겹 교차 검증을 사용한 다중 레이블/다중 출력 모델 학습 및 평가
def train_and_evaluate_kfold(X, y, n_splits=5):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    f1_micro_scores = []
    f1_macro_scores = []
    auc_roc_scores = []
    
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        model = MultiOutputClassifier(LogisticRegression(max_iter=500, solver='lbfgs', penalty='l2'))
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        y_pred_proba = np.array([estimator.predict_proba(X_test)[:, 1] for estimator in model.estimators_]).T
        
        # Calculate F1-scores
        f1_micro = f1_score(y_test, y_pred, average='micro')
        f1_macro = f1_score(y_test, y_pred, average='macro')
        f1_micro_scores.append(f1_micro)
        f1_macro_scores.append(f1_macro)
        
        # Calculate AUC-ROC score for each label and compute the mean
        aucrocs = []
        for i in range(y_test.shape[1]):
            if len(np.unique(y_test[:, i])) > 1:  # Only if both classes are present
                aucrocs.append(roc_auc_score(y_test[:, i], y_pred_proba[:, i]))
        auc_roc_mean = np.mean(aucrocs) if aucrocs else float('nan')
        auc_roc_scores.append(auc_roc_mean)

    f1_micro_mean = np.mean(f1_micro_scores)
    f1_macro_mean = np.mean(f1_macro_scores)
    auc_roc_mean_overall = np.nanmean(auc_roc_scores)  # Handle NaN values gracefully
    f1_micro_std = np.std(f1_micro_scores)
    f1_macro_std = np.std(f1_macro_scores)
    auc_roc_std = np.nanstd(auc_roc_scores)

    print(f"Average F1-score (Micro): {f1_micro_mean:.4f} ± {f1_micro_std:.4f}")
    print(f"Average F1-score (Macro): {f1_macro_mean:.4f} ± {f1_macro_std:.4f}")
    print(f"Average AUC-ROC score: {auc_roc_mean_overall:.4f} ± {auc_roc_std:.4f}")

    return f1_micro_scores, f1_macro_scores, auc_roc_scores

In [12]:
# 데이터셋별 임베딩 생성 및 학습/평가
embedding_dir = os.path.join(os.getcwd(), 'Embedding')
result_dir = os.path.join(os.getcwd(), 'Results')
os.makedirs(embedding_dir, exist_ok=True)
os.makedirs(result_dir, exist_ok=True)

In [13]:
for dataset_name, df in [("ClinTox", df_clintox), ("SIDER", df_sider), ("Tox21", df_tox21)]:
    print(f"\n=== Processing {dataset_name} ===")

    # 임베딩 생성
    embeddings = get_embeddings(model, tokenizer, df['smiles'].tolist())

    # 레이블 준비
    labels = df.drop(columns=['smiles']).values

    # K-겹 교차 검증을 통한 학습 및 평가
    print(f"=== Training and evaluating on {dataset_name} ===")
    f1_micro_scores, f1_macro_scores, auc_roc_scores = train_and_evaluate_kfold(embeddings, labels)

    # 결과 요약
    f1_micro_mean = np.mean(f1_micro_scores)
    f1_macro_mean = np.mean(f1_macro_scores)
    auc_roc_mean = np.nanmean(auc_roc_scores)  # NaN 처리

    f1_micro_std = np.std(f1_micro_scores)
    f1_macro_std = np.std(f1_macro_scores)
    auc_roc_std = np.nanstd(auc_roc_scores)

    print(f"{dataset_name} - F1-score (Micro): {f1_micro_mean:.4f} ± {f1_micro_std:.4f}")
    print(f"{dataset_name} - F1-score (Macro): {f1_macro_mean:.4f} ± {f1_macro_std:.4f}")
    print(f"{dataset_name} - AUC-ROC: {auc_roc_mean:.4f} ± {auc_roc_std:.4f}")

    # 결과 저장
    result_dir = os.path.join(os.getcwd(), 'Results')
    os.makedirs(result_dir, exist_ok=True)
    result_file = os.path.join(result_dir, f"{dataset_name}_metrics.txt")
    with open(result_file, 'w') as f:
        f.write(f"Dataset: {dataset_name}\n")
        f.write(f"F1-score (Micro): {f1_micro_mean:.4f} ± {f1_micro_std:.4f}\n")
        f.write(f"F1-score (Macro): {f1_macro_mean:.4f} ± {f1_macro_std:.4f}\n")
        f.write(f"AUC-ROC: {auc_roc_mean:.4f} ± {auc_roc_std:.4f}\n")

    print(f"Metrics for {dataset_name} saved to {result_file}")


=== Processing ClinTox ===


Processing SMILES with Roberta:   0%|          | 0/1480 [00:00<?, ?it/s]

=== Training and evaluating on ClinTox ===
Average F1-score (Micro): 0.9776 ± 0.0090
Average F1-score (Macro): 0.9118 ± 0.0306
Average AUC-ROC score: 0.9790 ± 0.0243
ClinTox - F1-score (Micro): 0.9776 ± 0.0090
ClinTox - F1-score (Macro): 0.9118 ± 0.0306
ClinTox - AUC-ROC: 0.9790 ± 0.0243
Metrics for ClinTox saved to /HDD1/bbq9088/GPT-MolBERTa/model_predict/Results/ClinTox_metrics.txt

=== Processing SIDER ===


Processing SMILES with Roberta:   0%|          | 0/1427 [00:00<?, ?it/s]



=== Training and evaluating on SIDER ===
Average F1-score (Micro): 0.7942 ± 0.0058
Average F1-score (Macro): 0.6065 ± 0.0048
Average AUC-ROC score: 0.6178 ± 0.0088
SIDER - F1-score (Micro): 0.7942 ± 0.0058
SIDER - F1-score (Macro): 0.6065 ± 0.0048
SIDER - AUC-ROC: 0.6178 ± 0.0088
Metrics for SIDER saved to /HDD1/bbq9088/GPT-MolBERTa/model_predict/Results/SIDER_metrics.txt

=== Processing Tox21 ===


Processing SMILES with Roberta:   0%|          | 0/7823 [00:00<?, ?it/s]



=== Training and evaluating on Tox21 ===
Average F1-score (Micro): 0.1342 ± 0.0212
Average F1-score (Macro): 0.1280 ± 0.0195
Average AUC-ROC score: 0.7503 ± 0.0085
Tox21 - F1-score (Micro): 0.1342 ± 0.0212
Tox21 - F1-score (Macro): 0.1280 ± 0.0195
Tox21 - AUC-ROC: 0.7503 ± 0.0085
Metrics for Tox21 saved to /HDD1/bbq9088/GPT-MolBERTa/model_predict/Results/Tox21_metrics.txt
