ChemBERT를 backbone으로 사용 https://huggingface.co/models?sort=downloads&search=chemBERT  
아이디어: GNN 대신 트랜스포머 계열을 활용해서 graph representation을 획득하는 (매우 단순한) 모델  
https://dacon.io/competitions/official/236127/codeshare/8812?page=1&dtype=recent

In [1]:
# molecule predictor 
import torch
import torch.nn as nn 
import torch.nn.functional as F

from transformers import AutoModelForSequenceClassification
import chem # 직접 작성한 chem.py 파일 추가함


class ChemBERT(nn.Module):
    """Pretrained sequence encoder followed by a small MLP regression head.

    The encoder named by ``chem.chosen`` is loaded as a sequence-classification
    model with ``BERT_out_dim`` labels. Its logits are concatenated with the
    per-sample tabular features in ``batch.mol_f`` and passed through
    Linear -> LayerNorm -> GELU -> Dropout -> Linear.

    Args:
        BERT_out_dim: number of logits the encoder produces (``num_labels``).
        projection_dim: input/output width of the projection layer and the
            LayerNorm; the concatenated ``[logits, mol_f]`` vector must have
            this width for the projection layer to accept it.
        out_dim: number of values predicted per sample.
    """

    def __init__(self, BERT_out_dim, projection_dim, out_dim) -> None:
        super().__init__()

        # The classification head of the pretrained model is reused as the
        # encoder output layer. The original author recorded its layout as:
        #   (classifier): RobertaClassificationHead(
        #       (dense): Linear(in_features=384, out_features=384, bias=True)
        #       (dropout): Dropout(p=0.144, inplace=False)
        #       (out_proj): Linear(in_features=384, out_features=BERT_out_dim, bias=True)
        #   )
        self.ChemBERT_encoder = AutoModelForSequenceClassification.from_pretrained(
            chem.chosen,
            num_labels=BERT_out_dim,
            problem_type="multi_label_classification",
        )

        # Head applied to the concatenated [encoder logits, tabular features].
        self.projection = nn.Linear(in_features=projection_dim, out_features=projection_dim)
        self.ln = nn.LayerNorm(normalized_shape=projection_dim)
        self.out = nn.Linear(in_features=projection_dim, out_features=out_dim)

        self.act = nn.GELU()
        self.drop = nn.Dropout(0.144)

    def forward(self, batch):
        """Return predictions for ``batch`` (reads ``input_ids`` and ``mol_f``)."""
        logits = self.ChemBERT_encoder(batch.input_ids).logits

        # Fuse the learned representation with the tabular features column-wise.
        fused = torch.cat((logits, batch.mol_f), dim=1)
        hidden = self.drop(self.act(self.ln(self.projection(fused))))

        return self.out(hidden)

2023-09-25 00:09:56.861085: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
No normalization for AvgIpc. Feature removed!
No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)


## Custom Dataset(PyG)과 PyTorch Lightning을 활용한 Dataloader

`chem` 에서 생성된 atomic & molecular feature를 dataset 형태로 만드는 코드

- dataset in PyG
    - using `torch_geometric`
    - `batch.x (atomic feature) batch.edge_index(molecular bonds)`를 통해서 GCN forward에 제공하면 작동
    - e.g. `h1 = self.conv1(g.x, g.edge_index)`

- pl.LightningDataModule
    - Dataloader를 만드는 클래스
    - KFold를 상정하고 코드를 작성, pytorch-lightning을 사용해서 더 간단하게 사용할 수 있음
    - 학습데이터에 존재하는 LogP 결측치는 rdkit 패키지의 LogP 값으로 대체함
     

In [2]:
from typing import Callable, Optional, Union

import numpy as np
import pandas as pd

from sklearn import preprocessing
from sklearn.model_selection import KFold
from sklearn.feature_selection import VarianceThreshold

import torch 
from torch.utils.data import Dataset

from torch_geometric.data import Data
from torch_geometric.data.dataset import IndexType
from torch_geometric.loader import DataLoader

import pytorch_lightning as pl

from chem import Chemical_feature_generator
from rdkit.Chem import PandasTools

In [3]:
# Descriptor column names (29 entries). Kept for reference: the dataset class
# below receives its molecular features as an array rather than selecting
# these columns.
feature_label = [
    'MolWt',
    'HeavyAtomMolWt',
    'NumValenceElectrons',
    'FractionCSP3',
    'HeavyAtomCount',
    'NHOHCount',
    'NOCount',
    'NumAliphaticCarbocycles',
    'NumAliphaticHeterocycles',
    'NumAliphaticRings',
    'NumAromaticCarbocycles',
    'NumAromaticHeterocycles',
    'NumAromaticRings',
    'NumHAcceptors',
    'NumHDonors',
    'NumHeteroatoms',
    'NumRotatableBonds',
    'RingCount',
    'MolMR',
    'CalcNumBridgeheadAtom',
    'ExactMolWt',
    'NumSaturatedCarbocycles',
    'NumSaturatedHeterocycles',
    'NumSaturatedRings',
    'MolLogP',
    'CalcNumAmideBonds',
    'CalcNumSpiroAtoms',
    'num_ammonium_groups',
    'num_alkoxy_groups',
]

# given_features are the tabular features this model actually uses (7 columns).
given_features = [
    'AlogP',
    'Molecular_Weight',
    'Num_H_Acceptors',
    'Num_H_Donors',
    'Num_RotatableBonds',
    'LogD',
    'Molecular_PolarSurfaceArea',
]

# Shared feature generator used by the dataset and the data module.
generator = Chemical_feature_generator()


class Chemical_dataset(Dataset):
    """Dataset yielding one ``Data`` object per row of a molecule table.

    Args:
        data_frame: table with a ``SMILES`` column and, when ``is_train`` is
            true, ``MLM`` and ``HLM`` target columns.
        fps: per-row fingerprint vectors, indexed positionally like the frame.
        mol_f: per-row molecular (tabular) feature vectors, indexed the same way.
        transform: stored on the instance; not applied anywhere in this class.
        is_train: when false, both targets are filled with the placeholder -99.
    """

    def __init__(self, data_frame: pd.DataFrame, fps, mol_f, transform = None, is_train = True):
        super().__init__()
        self.df, self.fps, self.mol_f = data_frame, fps, mol_f
        self.transform = transform

        self.is_train = is_train

    def __getitem__(self, idx: IndexType | int):
        return self.get_chem_prop(idx)

    def __len__(self) -> int:
        return len(self.df)

    def get_chem_prop(self, idx):
        """Build the ``Data`` sample for the row at positional index ``idx``."""
        row = self.df.iloc[idx]
        smiles = row["SMILES"]

        # Everything structural is derived from the SMILES string on the fly.
        edge_index, edge_attr = generator.get_adj_matrix(smiles = smiles)
        node_features = generator.generate_mol_atomic_features(smiles=smiles)
        token_ids = generator.encoder_smiles(smiles)  # original note: 384

        if self.is_train:
            mlm_value, hlm_value = row["MLM"], row["HLM"]
        else:
            # No labels available: use a sentinel so the sample layout is unchanged.
            mlm_value = hlm_value = -99.

        def as_row(values):
            # Float tensor reshaped to a single row, so samples stack on dim 0.
            return torch.tensor(values, dtype=torch.float).view(1, -1)

        mlm = as_row(mlm_value)
        hlm = as_row(hlm_value)

        return Data(
            x=torch.tensor(node_features, dtype=torch.float),
            mol_f=as_row(self.mol_f[idx]),
            fp=as_row(self.fps[idx]),
            edge_index=edge_index,
            edge_attr=edge_attr,
            input_ids=token_ids,
            y=torch.cat([mlm, hlm], dim=1),
            MLM=mlm,
            HLM=hlm,
        )

class KFold_pl_DataModule(pl.LightningDataModule):
    """LightningDataModule that serves one train/validation fold of a K-fold split.

    The CSV is read once, missing ``AlogP`` values are filled from ``MolLogP``,
    a feature vector is generated for every SMILES string, and the
    ``given_features`` columns are standardised. The scaler is fitted on the
    training fold only (so validation statistics do not leak into training)
    and kept as ``self.scaler`` so other data can be scaled identically.

    Args:
        train_df: path of the training CSV (first column is used as the index).
        k_idx: index of the fold used for validation.
        num_split: number of folds; scikit-learn's ``KFold`` needs at least 2.
        split_seed: random seed of the shuffled split.
        batch_size: batch size of both loaders.
        num_workers: worker processes of both loaders.
        pin_memory: forwarded to both loaders.
        persistent_workers: ignored; the effective value is ``num_workers > 0``.
        train_transform: recorded as a hyperparameter only; not applied.
        val_transform: recorded as a hyperparameter only; not applied.
    """

    def __init__(self,
            train_df : str = "../input/new_train.csv",
            k_idx : int = 1, # fold index
            num_split : int = 5, # number of folds
            split_seed: int = 41,
            batch_size: int =1 ,
            num_workers:int =0,
            pin_memory:bool=False,
            persistent_workers: bool = True,
            train_transform = None,
            val_transform = None) -> None:

        super().__init__()
        # Overwrite the argument before save_hyperparameters() is called:
        # persistent workers only make sense when worker processes exist.
        persistent_workers = num_workers > 0
        self.save_hyperparameters(logger=False)

        self.train_data = None
        self.val_data = None
        self.scaler = None  # fitted StandardScaler, available after setup()
        self.num_cls = 0

        self.setup()

    def setup(self, stage = None) -> None:
        """Load the CSV and build the train/validation datasets (runs once)."""
        # Compare against None explicitly: truth-testing a dataset would go
        # through __len__, which is not what this guard is about.
        if self.train_data is not None or self.val_data is not None:
            return

        df = pd.read_csv(self.hparams.train_df, index_col = 0)

        # Fill missing AlogP values with the MolLogP column.
        missing_alogp = df['AlogP'].isna()
        df.loc[missing_alogp, 'AlogP'] = df.loc[missing_alogp, 'MolLogP']

        # One generated feature array per molecule, stacked along axis 0.
        mol2vec = np.concatenate(
            [generator.get_mol_feature_from_deepchem(smiles = smiles) for smiles in df.SMILES],
            axis=0,
        )

        kf = KFold(n_splits = self.hparams.num_split,
                shuffle=True,
                random_state = self.hparams.split_seed)
        train_idx, val_idx = list(kf.split(df))[self.hparams.k_idx]
        train_idx, val_idx = train_idx.tolist(), val_idx.tolist()

        # Fit the scaler on training rows only, then apply it to every row.
        tabular = df[given_features].to_numpy()
        self.scaler = preprocessing.StandardScaler().fit(tabular[train_idx])
        craft_mol_f = self.scaler.transform(tabular)

        self.train_data = Chemical_dataset(
            data_frame=df.iloc[train_idx],
            fps=mol2vec[train_idx],
            mol_f=craft_mol_f[train_idx],
            transform=None,
            is_train=True,
        )
        # The validation fold also carries labels, hence is_train=True.
        self.val_data = Chemical_dataset(
            data_frame=df.iloc[val_idx],
            fps=mol2vec[val_idx],
            mol_f=craft_mol_f[val_idx],
            transform=None,
            is_train=True,
        )

    def train_dataloader(self):
        """Return a shuffled loader over the training fold (last partial batch dropped)."""
        return DataLoader(self.train_data,
                          batch_size=self.hparams.batch_size,
                          shuffle=True,
                          num_workers=self.hparams.num_workers,
                          persistent_workers=self.hparams.persistent_workers,
                          pin_memory=self.hparams.pin_memory,
                          drop_last=True)

    def val_dataloader(self):
        """Return an unshuffled loader over the validation fold."""
        return DataLoader(self.val_data,
                          batch_size=self.hparams.batch_size,
                          shuffle=False,
                          num_workers=self.hparams.num_workers,
                          persistent_workers=self.hparams.persistent_workers,
                          pin_memory=self.hparams.pin_memory)
        

    

In [4]:
class EarlyStopping:
    """Report when validation loss has failed to improve ``patience`` times in a row.

    A check counts as an improvement when
    ``validation_loss + min_delta < min_validation_loss``. A positive
    ``min_delta`` therefore demands a real decrease of that size, while a
    negative one tolerates a loss up to ``abs(min_delta)`` above the best.

    ``min_validation_loss`` always holds the lowest loss seen so far. A loss
    that is merely inside the tolerance band resets the counter but never
    raises the recorded best, so the baseline cannot drift upward.

    Args:
        patience: consecutive non-improving checks allowed before stopping.
        min_delta: margin added to the new loss before comparing with the best.
    """

    def __init__(self, patience=5, min_delta=-1):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0  # consecutive checks without improvement
        self.min_validation_loss = np.inf

    def __call__(self, validation_loss, verbose=False):
        """Record ``validation_loss``; return True once patience is exhausted."""
        if (validation_loss + self.min_delta) < self.min_validation_loss:
            # Keep the true minimum even when the loss was only "close enough".
            self.min_validation_loss = min(self.min_validation_loss, validation_loss)
            self.counter = 0
            return False

        # Anything else (worse, exactly equal, or NaN) counts as a failed check.
        self.counter += 1
        if verbose:
            print(f"  >> now{validation_loss:.3f} > best{self.min_validation_loss:.3f}")
        return self.counter >= self.patience

In [5]:
def train_MLM(train_loader, valid_loader, model, criterion, optimizer, scheduler,  epochs):
    """Train ``model`` on the MLM target and return it.

    Every epoch runs one pass over ``train_loader``. Every 20th epoch
    (``epoch % 20 == 0``) the model is evaluated on ``valid_loader``, a
    progress line is printed, early stopping is checked and, if training
    continues, the scheduler is stepped once.

    Args:
        train_loader: iterable of batches exposing ``.to(device)`` and ``.MLM``.
        valid_loader: iterable of validation batches of the same kind.
        model: module to optimise; batches are moved to its device.
        criterion: loss called as ``criterion(output, target)``.
        optimizer: optimizer bound to ``model``'s parameters.
        scheduler: learning-rate scheduler, stepped at validation checks only.
        epochs: maximum number of epochs.

    Returns:
        The trained ``model`` (the same object that was passed in).
    """
    # Use the device the model already lives on instead of assuming a GPU;
    # fall back to "cuda" for a parameterless model, as the code did before.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cuda")

    earlyStop = EarlyStopping(patience= 8, min_delta=-10)

    for epoch in range(epochs):
        model.train()
        running_loss = 0
        for batch in train_loader:
            optimizer.zero_grad()  # gradients accumulate otherwise

            batch = batch.to(device)
            loss = criterion(model(batch), batch.MLM)
            loss.backward()

            optimizer.step()
            running_loss += loss.item()

        if epoch % 20 == 0:
            model.eval()
            valid_loss = 0
            with torch.no_grad():
                for batch in valid_loader:
                    batch = batch.to(device)
                    valid_loss += criterion(model(batch), batch.MLM).item()

            # Reported values are the square root of the mean per-batch loss.
            train_score = np.sqrt(running_loss/len(train_loader))
            valid_score = np.sqrt(valid_loss/len(valid_loader))
            print(f"Epoch: {epoch:4d}/{epochs} with lr {scheduler.get_last_lr()[0]:.9f}, Train Loss: {train_score}, Valid Loss: {valid_score}")

            # Early stopping is fed the summed (not averaged) validation loss.
            if earlyStop(valid_loss, verbose=True):
                break

            scheduler.step()

    return model

In [6]:
def train_HLM(train_loader, valid_loader, model, criterion, optimizer, scheduler,  epochs):
    """Train ``model`` on the HLM target and return it.

    Every epoch runs one pass over ``train_loader``. Every 20th epoch
    (``epoch % 20 == 0``) the model is evaluated on ``valid_loader``, a
    progress line is printed, early stopping is checked and, if training
    continues, the scheduler is stepped once.

    Args:
        train_loader: iterable of batches exposing ``.to(device)`` and ``.HLM``.
        valid_loader: iterable of validation batches of the same kind.
        model: module to optimise; batches are moved to its device.
        criterion: loss called as ``criterion(output, target)``.
        optimizer: optimizer bound to ``model``'s parameters.
        scheduler: learning-rate scheduler, stepped at validation checks only.
        epochs: maximum number of epochs.

    Returns:
        The trained ``model`` (the same object that was passed in).
    """
    # Use the device the model already lives on instead of assuming a GPU;
    # fall back to "cuda" for a parameterless model, as the code did before.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cuda")

    earlyStop = EarlyStopping(patience= 8, min_delta=-10)

    for epoch in range(epochs):
        model.train()
        running_loss = 0
        for batch in train_loader:
            optimizer.zero_grad()  # gradients accumulate otherwise

            batch = batch.to(device)
            loss = criterion(model(batch), batch.HLM)
            loss.backward()

            optimizer.step()
            running_loss += loss.item()

        if epoch % 20 == 0:
            model.eval()
            valid_loss = 0
            with torch.no_grad():
                for batch in valid_loader:
                    batch = batch.to(device)
                    valid_loss += criterion(model(batch), batch.HLM).item()

            # Reported values are the square root of the mean per-batch loss.
            train_score = np.sqrt(running_loss/len(train_loader))
            valid_score = np.sqrt(valid_loss/len(valid_loader))
            print(f"Epoch: {epoch:4d}/{epochs} with lr {scheduler.get_last_lr()[0]:.9f}, Train Loss: {train_score}, Valid Loss: {valid_score}")

            # Early stopping is fed the summed (not averaged) validation loss.
            if earlyStop(valid_loss, verbose=True):
                break

            scheduler.step()

    return model

In [7]:
# Hyperparameter
CFG = {'BATCH_SIZE': 256,
       'EPOCHS': 8000,
       'HIDDEN_SIZE': 1024,
       'OUTPUT_SIZE': 1,
       'DROPOUT_RATE': 0.8,
       'LEARNING_RATE': 0.0001}

BERT_param = {"BERT_out_dim": 100,
              "projection_dim": 100+len(given_features),
              "out_dim" : 1}  # {"BERT_out_dim" : bert 인코딩 출력 차원 , "projection_dim":  mlp에 들어가는 fc 레이어의 차원 (인코딩 + tabular features), "out_dim": 최종 출력 차원}

In [8]:
# NOTE(review): the data module is built with its defaults, so
# CFG['BATCH_SIZE'] is not applied here - confirm whether that is intended.
data = KFold_pl_DataModule()

# One independent model per target; BERT_param's keys match ChemBERT's
# constructor parameters, so it can be unpacked directly.
model_MLM = ChemBERT(**BERT_param).to("cuda")
model_HLM = ChemBERT(**BERT_param).to("cuda")

# A single MSE criterion is shared by both training runs.
criterion = nn.MSELoss()

# Each model gets its own Adam optimizer and a schedule that multiplies the
# base learning rate by 0.95 ** (number of scheduler steps taken).
optimizer_MLM = torch.optim.Adam(model_MLM.parameters(), lr=CFG['LEARNING_RATE'])
scheduler_MLM = torch.optim.lr_scheduler.LambdaLR(
    optimizer=optimizer_MLM,
    lr_lambda=lambda epoch: 0.95**(epoch),
)

optimizer_HLM = torch.optim.Adam(model_HLM.parameters(), lr=CFG['LEARNING_RATE'])
scheduler_HLM = torch.optim.lr_scheduler.LambdaLR(
    optimizer=optimizer_HLM,
    lr_lambda=lambda epoch: 0.95**(epoch),
)

#print(model_MLM, model_HLM)

In [None]:
# Train the two target-specific models one after the other; each run gets
# freshly created train/validation loaders from the same data module.
model_MLM = train_MLM(
    train_loader=data.train_dataloader(),
    valid_loader=data.val_dataloader(),
    model=model_MLM,
    criterion=criterion,
    optimizer=optimizer_MLM,
    scheduler=scheduler_MLM,
    epochs=CFG["EPOCHS"],
)
model_HLM = train_HLM(
    train_loader=data.train_dataloader(),
    valid_loader=data.val_dataloader(),
    model=model_HLM,
    criterion=criterion,
    optimizer=optimizer_HLM,
    scheduler=scheduler_HLM,
    epochs=CFG["EPOCHS"],
)

In [None]:
def inference(test_loader, model, label_scaler=None):
    """Run ``model`` over ``test_loader`` and return all predictions as a flat list.

    Args:
        test_loader: iterable of batches exposing ``.to(device)``.
        model: trained module; it is switched to eval mode and batches are
            moved to the device its parameters live on.
        label_scaler: optional object with ``inverse_transform``; when given,
            each batch of outputs is mapped back through it (on the CPU)
            before being collected.

    Returns:
        A flat Python list with one entry per predicted value, in loader order.
    """
    # Use the device the model already lives on instead of assuming a GPU;
    # fall back to "cuda" for a parameterless model, as the code did before.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cuda")

    model.eval()
    preds = []

    with torch.no_grad():
        for batch in test_loader:
            output = model(batch.to(device))
            if label_scaler is not None:
                output = label_scaler.inverse_transform(output.cpu())
            preds.extend(output.flatten().tolist())

    return preds

In [None]:
# Predict each target with its own model.
# NOTE(review): test_MLM_loader / test_HLM_loader are not created in the code
# shown here - presumably built from the test set elsewhere; confirm.
predictions_MLM = inference(test_loader=test_MLM_loader, model=model_MLM)
predictions_HLM = inference(test_loader=test_HLM_loader, model=model_HLM)

In [None]:
# Load the sample submission and set both prediction columns.
submission = pd.read_csv('../input/sample_submission.csv').assign(
    MLM=predictions_MLM,
    HLM=predictions_HLM,
)
submission

In [None]:
# Write the submission file without the index column, then show summary
# statistics of the table as a quick sanity check.
submission.to_csv(path_or_buf='../output/submission.csv', index=False)
submission.describe()