In [1]:
import torch
import pandas as pd
from pathlib import Path
from tqdm.auto import tqdm
import numpy as np
import wandb

In [14]:
!python3 -m wandb login eb7b1964fb84cd81de96b2a273ecf2bb6254aeac

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/alexeyorlov53/.netrc


In [15]:
model_name_base = 'graphormer-base-pcqm4mv1'
model_name = 'clefourrier/graphormer-base-pcqm4mv1'

In [16]:
batch_size = 2

### Upload Dataset

In [2]:
dataframe = pd.read_csv("data_10k_graph.csv")

In [6]:
print('Percentage on NaNs:')
dataframe.isna().mean()

Percentage on NaNs:


y             0.0
Smiles        0.0
ecfp1         0.0
ecfp2         0.0
ecfp3         0.0
node_feat     0.0
edge_index    0.0
edge_attr     0.0
num_nodes     0.0
dtype: float64

In [6]:
dataframe = dataframe.drop(columns=['Smiles', 'ecfp1', 'ecfp2', 'ecfp3'])

In [7]:
def preprocess_array_column(df, column):
    for row in tqdm(range(len(df))):
        str_ints = eval(df.iloc[row][column])
        df.at[row, column] = str_ints

In [8]:
preprocess_array_column(dataframe, 'node_feat')
preprocess_array_column(dataframe, 'edge_index')
preprocess_array_column(dataframe, 'edge_attr')
preprocess_array_column(dataframe, 'y')

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

### Normalize target

In [None]:
dataframe['y'] = dataframe['y'].apply(lambda x: x[0])

In [18]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler() # отображает данные в отрезок [0, 1]
dataframe['y'] = scaler.fit_transform(dataframe['y'].to_numpy().reshape(-1, 1))

In [20]:
dataframe['y'] = dataframe['y'].apply(lambda x: [x])

In [21]:
dataframe['y']

0        [0.06945573071852935]
1         [0.0632708525199993]
2        [0.14947089947089948]
3       [0.048213458054186156]
4        [0.07334422834991665]
                 ...          
9995    [0.044576268125756185]
9996     [0.05272345317851575]
9997     [0.05001926190777158]
9998     [0.05197855908891177]
9999    [0.033902462514521677]
Name: y, Length: 10000, dtype: object

### Create Dataloader

In [12]:
from datasets import Dataset, DatasetDict

dataset = Dataset.from_pandas(dataframe)
train_testvalid = dataset.train_test_split(test_size=0.2, seed=15)

test_valid = train_testvalid['test'].train_test_split(test_size=0.5, seed=15)

# 10% for test, 10 for validation, 80% for train
dataset = DatasetDict({
    'train': train_testvalid['train'],
    'test': test_valid['test'],
    'validation': test_valid['train']})

dataset

DatasetDict({
    train: Dataset({
        features: ['y', 'node_feat', 'edge_index', 'edge_attr', 'num_nodes'],
        num_rows: 8000
    })
    test: Dataset({
        features: ['y', 'node_feat', 'edge_index', 'edge_attr', 'num_nodes'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['y', 'node_feat', 'edge_index', 'edge_attr', 'num_nodes'],
        num_rows: 1000
    })
})

In [13]:
from transformers.models.graphormer.collating_graphormer import preprocess_item, GraphormerDataCollator

dataset_processed = dataset.map(preprocess_item, batched=False)
# data_loader = GraphormerDataCollator(on_the_fly_processing=True) # либо препроцессинг либо коллайтор с on_the_fly_processing

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [14]:
dataset_processed.save_to_disk('dataset_10k_graphormer_preprocessed_normilized')

In [18]:
# from datasets import load_from_disk
# dataset_processed = load_from_disk('dataset_10k_graphormer_preprocessed')

### Create Model

In [16]:
# from transformers import AutoModel, AutoConfig

# config = AutoConfig.from_pretrained(model_name)
# AutoModel.from_pretrained(model_name, config=config)

In [17]:
# from transformers import AutoModel, AutoConfig

# class MolecularPropertiesRegression(torch.nn.Module):
#     def __init__(self, model_name, num_properties):
#         super(MolecularPropertiesRegression, self).__init__()
#         self.num_properties = num_properties

#         config = AutoConfig.from_pretrained(model_name)
#         self.transformer = AutoModel.from_pretrained(model_name, config=config)
#         # removing last layer of transformer
#         self.transformer.pooler = torch.nn.Identity()
#         # freezing transformer weights
#         for param in self.transformer.parameters():
#             param.requires_grad = False
#         self.regressor = torch.nn.Linear(768, num_properties)

#     def forward(self, input_ids = None, attention_mask=None):
#         outputs = self.transformer(input_ids=input_ids, attention_mask=attention_mask)
        
#         last_hidden_state = outputs[0]
#         # last_hidden_state is the shape of (batch_size=32, input_sequence_length=512, hidden_size=768)
#         # so we take only hidden emdedding for [CLS] token (first) as it contains the entire context
#         # and would be sufficient for simple downstream tasks such as classification/regression
#         predicted_property_values = self.regressor(last_hidden_state[:, 0, : ].view(-1, 768))

#         return predicted_property_values
        

### Create DataLoader

In [19]:
from transformers.models.graphormer.collating_graphormer import GraphormerDataCollator

class GraphormerDataCollator_():
    def __init__(self):
        self.data_collator = GraphormerDataCollator()

    def __call__(self, features):
        for mol in features:
            if mol['num_nodes'] == 1:
                features.remove(mol)
        return self.data_collator(features)

In [20]:
from torch.utils.data import DataLoader

data_collator = GraphormerDataCollator_()

train_dataloader = DataLoader(
    dataset_processed['train'], shuffle = False, batch_size = batch_size, collate_fn = data_collator
)

eval_dataloader = DataLoader(
    dataset_processed['validation'], shuffle = False, batch_size = batch_size, collate_fn = data_collator
)

In [20]:
device = torch.device("cuda", index=5) if torch.cuda.is_available() else torch.device('cpu')

In [21]:
from transformers import GraphormerForGraphClassification

model = GraphormerForGraphClassification.from_pretrained(
    model_name, 
    num_classes=1,
    ignore_mismatched_sizes = True, # provide this in case you're planning to fine-tune an already fine-tuned checkpoint
).to(device)

In [22]:
model

GraphormerForGraphClassification(
  (encoder): GraphormerModel(
    (graph_encoder): GraphormerGraphEncoder(
      (dropout_module): Dropout(p=0.0, inplace=False)
      (graph_node_feature): GraphormerGraphNodeFeature(
        (atom_encoder): Embedding(4609, 768, padding_idx=0)
        (in_degree_encoder): Embedding(512, 768, padding_idx=0)
        (out_degree_encoder): Embedding(512, 768, padding_idx=0)
        (graph_token): Embedding(1, 768)
      )
      (graph_attn_bias): GraphormerGraphAttnBias(
        (edge_encoder): Embedding(1537, 32, padding_idx=0)
        (edge_dis_encoder): Embedding(131072, 1)
        (spatial_pos_encoder): Embedding(512, 32, padding_idx=0)
        (graph_token_virtual_distance): Embedding(1, 32)
      )
      (emb_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (layers): ModuleList(
        (0-11): 12 x GraphormerGraphEncoderLayer(
          (dropout_module): Dropout(p=0.0, inplace=False)
          (activation_dropout_module): Dr

In [23]:
from transformers import AdamW, get_scheduler

optimizer = AdamW(model.parameters(), lr=5e-5)

num_epoch = 100

num_training_steps = num_epoch * len(train_dataloader)

lr_scheduler = get_scheduler(
    'linear',
    optimizer = optimizer,
    num_warmup_steps = 0,
    num_training_steps = num_training_steps,
)

loss_func = torch.nn.MSELoss()



In [24]:
wandb.init(
    project="graphormer",
    name="Graphormer Simple Classification on MolecularWeight 10k 100_epochs",
    config={}
)

[34m[1mwandb[0m: Currently logged in as: [33morlov-aleksei53[0m ([33mmoleculary-ai[0m). Use [1m`wandb login --relogin`[0m to force relogin


### Training

In [25]:
from tqdm.auto import tqdm

progress_bar_train = tqdm(range(num_training_steps))
progress_bar_eval = tqdm(range(num_epoch * len(eval_dataloader)))

for epoch in range(num_epoch):
    model.train()
    train_epoch_loss = 0
    for batch in train_dataloader:
        input_batch = { k: v.to(device) for k, v in batch.items() }
        
        outputs = model(**input_batch)
        
        loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0]
        loss.backward()
        train_epoch_loss += loss.item()
        
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar_train.update(1)

    model.eval()
    eval_epoch_loss = 0
    for batch in eval_dataloader:
        input_batch = { k: v.to(device) for k, v in batch.items() }

        with torch.no_grad():
            outputs = model(**input_batch)

        loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0]
        eval_epoch_loss += loss.item()

        progress_bar_eval.update(1)
    
    wandb.log({"loss/train": train_epoch_loss / len(train_dataloader), "loss/validation": eval_epoch_loss / len(eval_dataloader)})

  0%|          | 0/40000 [00:00<?, ?it/s]

  0%|          | 0/5000 [00:00<?, ?it/s]

In [26]:
torch.onnx.export(model, input_batch, model_name_base + "_10k_100_epochs.onnx")
wandb.save(model_name_base + "_10k_100_epochs.onnx")

  if not (embedding_dim == self.embedding_dim):
  if not (list(query.size()) == [tgt_len, bsz, embedding_dim]):
  if (key_bsz != bsz) or (value is None) or not (src_len, bsz == value.shape[:2]):
  if (k is None) or not (k.size(1) == src_len):
  if key_padding_mask.size(0) != bsz or key_padding_mask.size(1) != src_len:
  if list(attn_weights.size()) != [bsz * self.num_heads, tgt_len, src_len]:
  if list(attn.size()) != [bsz * self.num_heads, tgt_len, self.head_dim]:


['/home/alexeyorlov53/Transformers-for-Molecules/graphormer/wandb/run-20240412_005415-uxqlwcb8/files/graphormer-base-pcqm4mv1_10k_10_epochs.onnx']

## Post Training Evaluation

In [27]:
test_dataloader = DataLoader(
    dataset_processed['test'], batch_size = batch_size, collate_fn = data_collator
)

model.eval()
epoch_loss = 0
for batch in tqdm(test_dataloader):
        input_batch = { k: v.to(device) for k, v in batch.items() }

        with torch.no_grad():
            outputs = model(**input_batch)

        loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0]
        epoch_loss += loss.item()
    
wandb.log({"loss/test": epoch_loss / len(test_dataloader)})

  0%|          | 0/500 [00:00<?, ?it/s]

In [28]:
wandb.finish()

VBox(children=(Label(value='181.616 MB of 181.616 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
loss/test,▁
loss/train,█▃▂▂▂▂▁▁▁▁
loss/validation,█▃▂▂▅▃▁▁▁▁

0,1
loss/test,38086.50964
loss/train,35185.91918
loss/validation,46093.69094


In [None]:
model.save_pretrained(model_name_base + '_10k_100epochs unnormalized')

In [29]:
# torch.save(model, model_name_base + '_10k_10epochs.pt')

In [30]:
torch.cuda.empty_cache()