In [1]:
import os
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import f1_score, roc_auc_score
from torch.utils.data import DataLoader
from transformers import RobertaTokenizer, RobertaModel
from transformers import RobertaTokenizer, RobertaForSequenceClassification
import deepchem as dc

No normalization for SPS. Feature removed!
No normalization for AvgIpc. Feature removed!
Skipped loading some Tensorflow models, missing a dependency. No module named 'tensorflow'
Skipped loading modules with pytorch-geometric dependency, missing a dependency. No module named 'torch_geometric'
Skipped loading modules with pytorch-geometric dependency, missing a dependency. cannot import name 'DMPNN' from 'deepchem.models.torch_models' (/HDD1/bbq9088/miniconda3/envs/molberta/lib/python3.10/site-packages/deepchem/models/torch_models/__init__.py)
Skipped loading modules with pytorch-lightning dependency, missing a dependency. No module named 'lightning'
Skipped loading some Jax models, missing a dependency. No module named 'jax'
Skipped loading some PyTorch models, missing a dependency. No module named 'tensorflow'


In [2]:
# GPU 설정
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Device: {device}")

Device: cuda


In [3]:
# 로컬에서 Roberta 모델과 Tokenizer 로드
tokenizer_path = "./origin_model/roberta/tokenizer_folder"
model_path = "./origin_model/roberta"

In [4]:
# 토크나이저와 모델 로드
tokenizer = RobertaTokenizer.from_pretrained(tokenizer_path)
model = RobertaForSequenceClassification.from_pretrained(model_path, num_labels=2).to(device)
model.eval()

  return torch.load(checkpoint_file, map_location="cpu")
Some weights of the model checkpoint at ./origin_model/roberta were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at ./origin_model/roberta and are newly initialized: ['classifier.dense.weight', 'classifier.dense.b

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50000, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-5): 6 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (

In [5]:
def load_dataset(dataset_name, featurizer):
    if dataset_name == "ClinTox":
        tasks, datasets, transformers = dc.molnet.load_clintox(featurizer=featurizer, splitter=None, transformers=[], reload=True)
        dataset = datasets[0]
        df = pd.DataFrame({
            'smiles': dataset.ids,
            'FDA_APPROVED': dataset.y[:, 0],
            'CT_TOX': dataset.y[:, 1]
        }).dropna()
        return df

    elif dataset_name == "SIDER":
        tasks, datasets, transformers = dc.molnet.load_sider(featurizer=featurizer, splitter=None, transformers=[], reload=True)
        dataset = datasets[0]
        df = pd.DataFrame(data=dataset.y, columns=tasks)
        df['smiles'] = dataset.ids
        df = df.dropna()
        return df

    elif dataset_name == "Tox21":
        tasks, datasets, transformers = dc.molnet.load_tox21(featurizer=featurizer, splitter=None, transformers=[], reload=True)
        dataset = datasets[0]
        df = pd.DataFrame(data=dataset.y, columns=tasks)
        df['smiles'] = dataset.ids
        df = df.dropna()
        return df

    else:
        raise ValueError(f"Unknown dataset: {dataset_name}")

In [6]:
def prepare_data(df):
    texts = [f"SMILES: {smiles}" for smiles in df['smiles']]
    labels = df.iloc[:, 1:].values  # 첫 번째 열 제외 (smiles)
    return texts, labels

In [7]:
class SMILESDataset(torch.utils.data.Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        # 레이블 검증: 숫자형이 아닌 경우 예외 발생
        if not isinstance(label, (np.ndarray, list)) or not np.issubdtype(label.dtype, np.number):
            raise ValueError(f"Invalid label format: {label} at index {idx}")

        inputs = tokenizer(text, return_tensors="pt", padding="max_length", truncation=True, max_length=512)
        inputs['labels'] = torch.tensor(label, dtype=torch.float)  # 다중 레이블
        return {key: val.squeeze(0) for key, val in inputs.items()}

In [8]:
# 추론 및 평가 함수
def evaluate_dataset(dataset_name, featurizer, model, tokenizer, batch_size=16):
    print(f"\n=== Evaluating {dataset_name} ===")

    # 데이터셋 로드
    df = load_dataset(dataset_name, featurizer)

    # 데이터 준비
    texts, labels = prepare_data(df)

    # DataLoader 생성
    dataset = SMILESDataset(texts, labels)
    data_loader = DataLoader(dataset, batch_size=batch_size)

    all_preds, all_labels = [], []

    # 모델 추론
    with torch.no_grad():
        for batch in tqdm(data_loader, desc=f"Predicting {dataset_name}"):
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(input_ids=batch["input_ids"], attention_mask=batch["attention_mask"])
            logits = outputs.logits  # [batch_size, num_labels]
            preds = torch.sigmoid(logits).cpu().numpy()  # Sigmoid 활성화 함수 적용
            labels = batch['labels'].cpu().numpy()

            all_preds.extend(preds)
            all_labels.extend(labels)

    # 성능 평가
    all_preds = np.array(all_preds)
    all_labels = np.array(all_labels)

    roc_auc = roc_auc_score(all_labels, all_preds, average='macro')
    f1_micro = f1_score(all_labels, all_preds > 0.5, average='micro')
    f1_macro = f1_score(all_labels, all_preds > 0.5, average='macro')

    print(f"ROC-AUC (Macro): {roc_auc:.4f}")
    print(f"F1-Score (Micro): {f1_micro:.4f}")
    print(f"F1-Score (Macro): {f1_macro:.4f}")

In [9]:
# Featurizer 설정 (CircularFingerprint 사용)
featurizer = dc.feat.CircularFingerprint(radius=2, size=2048)

# 모델과 토크나이저 로드
tokenizer = RobertaTokenizer.from_pretrained("./origin_model/roberta/tokenizer_folder")
model = RobertaForSequenceClassification.from_pretrained("./origin_model/roberta", num_labels=2).to(device)
model.eval()

# 데이터셋별 평가
for dataset_name in ["ClinTox",  "Tox21"]:
    evaluate_dataset(dataset_name, featurizer, model, tokenizer, batch_size=16)

Some weights of the model checkpoint at ./origin_model/roberta were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at ./origin_model/roberta and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.out_proj.we


=== Evaluating ClinTox ===


Predicting ClinTox: 100%|██████████████████████████████████████████████████████████| 93/93 [00:04<00:00, 21.18it/s]


ROC-AUC (Macro): 0.3008
F1-Score (Micro): 0.9301
F1-Score (Macro): 0.4832

=== Evaluating Tox21 ===


Predicting Tox21:   0%|                                                                    | 0/489 [00:00<?, ?it/s]


ValueError: Invalid label format: [0.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0
 'CCOc1ccc2nc(S(N)(=O)=O)sc2c1'] at index 0