In [1]:
import os
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import mean_squared_error, r2_score
from torch.utils.data import DataLoader
from transformers import RobertaTokenizer, RobertaForSequenceClassification
import deepchem as dc

No normalization for SPS. Feature removed!
No normalization for AvgIpc. Feature removed!
Skipped loading some Tensorflow models, missing a dependency. No module named 'tensorflow'
Skipped loading modules with pytorch-geometric dependency, missing a dependency. No module named 'torch_geometric'
Skipped loading modules with pytorch-geometric dependency, missing a dependency. cannot import name 'DMPNN' from 'deepchem.models.torch_models' (/HDD1/bbq9088/miniconda3/envs/molberta/lib/python3.10/site-packages/deepchem/models/torch_models/__init__.py)
Skipped loading modules with pytorch-lightning dependency, missing a dependency. No module named 'lightning'
Skipped loading some Jax models, missing a dependency. No module named 'jax'
Skipped loading some PyTorch models, missing a dependency. No module named 'tensorflow'


In [2]:
# GPU 설정
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Device: {device}")

Device: cuda


In [3]:
# 로컬에서 Roberta 모델과 Tokenizer 로드
tokenizer_path = "./origin_model/roberta/tokenizer_folder"
model_path = "./origin_model/roberta"

In [4]:
# 토크나이저 로드
tokenizer = RobertaTokenizer.from_pretrained(tokenizer_path)

In [5]:
# RoBERTa 모델 로드 함수 (회귀용, num_labels=1 고정)
def load_model(model_path):
    return RobertaForSequenceClassification.from_pretrained(model_path, num_labels=1).to(device)

In [6]:
# 데이터 로드 함수
def load_datasets(featurizer):
    # FreeSolv
    tasks_freesolv, datasets_freesolv, _ = dc.molnet.load_freesolv(featurizer=featurizer, splitter=None, transformers=[], reload=True)
    dataset_freesolv = datasets_freesolv[0]
    df_freesolv = pd.DataFrame({'smiles': dataset_freesolv.ids, 'label': dataset_freesolv.y[:, 0]}).dropna()

    # Lipophilicity
    tasks_lipo, datasets_lipo, _ = dc.molnet.load_lipo(featurizer=featurizer, splitter=None, transformers=[], reload=True)
    dataset_lipo = datasets_lipo[0]
    df_lipo = pd.DataFrame({'smiles': dataset_lipo.ids, 'label': dataset_lipo.y[:, 0]}).dropna()

    # ESOL
    tasks_esol, datasets_esol, _ = dc.molnet.load_delaney(featurizer=featurizer, splitter=None, transformers=[], reload=True)
    dataset_esol = datasets_esol[0]
    df_esol = pd.DataFrame({'smiles': dataset_esol.ids, 'label': dataset_esol.y[:, 0]}).dropna()

    return {'FreeSolv': df_freesolv, 'Lipophilicity': df_lipo, 'ESOL': df_esol}

In [7]:
# 데이터 준비 함수
def prepare_data(df):
    texts = [f"SMILES: {smiles}" for smiles in df['smiles']]
    labels = df['label'].values.astype(np.float32)
    return texts, labels

In [8]:
# 커스텀 데이터셋 클래스
class SMILESDataset(torch.utils.data.Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        inputs = tokenizer(text, return_tensors="pt", padding="max_length", truncation=True, max_length=512)
        inputs['labels'] = torch.tensor(label, dtype=torch.float)
        return {key: val.squeeze(0) for key, val in inputs.items()}

In [9]:
# 회귀 평가 함수
def evaluate_regression(df, model, batch_size=16):
    texts, labels = prepare_data(df)

    dataset = SMILESDataset(texts, labels)
    data_loader = DataLoader(dataset, batch_size=batch_size)

    all_preds, all_labels = [], []

    # 모델 추론
    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Predicting"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            preds = outputs.logits.squeeze(-1).cpu().numpy()

            all_preds.extend(preds)
            all_labels.extend(labels.cpu().numpy())

    # RMSE 및 R² 계산
    all_preds = np.array(all_preds)
    all_labels = np.array(all_labels)
    rmse = np.sqrt(mean_squared_error(all_labels, all_preds))
    r2 = r2_score(all_labels, all_preds)

    print(f"RMSE: {rmse:.4f}")
    print(f"R²: {r2:.4f}")
    return rmse, r2

In [10]:
# Featurizer 설정
featurizer = dc.feat.CircularFingerprint(radius=2, size=2048)

# 데이터셋 로드
datasets = load_datasets(featurizer)

In [11]:
# 모델 로드
model = load_model(model_path)
model.eval()

  return torch.load(checkpoint_file, map_location="cpu")
Some weights of the model checkpoint at ./origin_model/roberta were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at ./origin_model/roberta and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50000, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-5): 6 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (

In [12]:
# 데이터셋별 평가
for dataset_name, df in datasets.items():
    print(f"\n=== Evaluating {dataset_name} ===")
    evaluate_regression(df, model)


=== Evaluating FreeSolv ===


Predicting: 100%|██████████████████████████████████████████████████████████████████| 41/41 [00:03<00:00, 11.53it/s]


RMSE: 1.0704
R²: -0.1458

=== Evaluating Lipophilicity ===


Predicting: 100%|████████████████████████████████████████████████████████████████| 263/263 [00:21<00:00, 11.96it/s]


RMSE: 2.1468
R²: -2.1852

=== Evaluating ESOL ===


Predicting: 100%|██████████████████████████████████████████████████████████████████| 71/71 [00:05<00:00, 12.11it/s]

RMSE: 4.0285
R²: -2.6959



