In [14]:
import torch
import pandas as pd
from torch_geometric.datasets import MoleculeNet
from torch_geometric.loader import DataLoader
from torch_geometric.data import Data
from torch.utils.data import Subset
from torch_geometric.utils.convert import to_networkx
from networkx import all_pairs_shortest_path
import numpy as np
from torch_geometric.utils.smiles import from_smiles

In [15]:
path = r"C:\Users\rhys-\OneDrive\data_hnrs\spectra\nist\nist_exp_spectra_fixed.csv"
df = pd.read_csv(path)



In [30]:
input_data = from_smiles(df['smiles'][0])
input_data
labels = np.arange(400,4002,2).astype(str)
dataset = []

for i in range(len(df)):
    input_data = from_smiles(df['smiles'][i])

    dataset.append(input_data)


for x in dataset:
    x.y = torch.tensor((df[labels].iloc[i].values), dtype=torch.float32)
    # x.x = x.x.to(torch.float32)
    # x.edge_index = x.edge_index.to(torch.float32) 
    # x.edge_attr = x.edge_attr.to(torch.float32)
# create a DataLoader
loader = DataLoader(dataset, batch_size=32, shuffle=True)

dataset

[Data(x=[14, 9], edge_index=[2, 28], edge_attr=[28, 3], smiles='C=CC(=O)OCCOc1ccccc1', y=[1801]),
 Data(x=[10, 9], edge_index=[2, 22], edge_attr=[22, 3], smiles='Nc1ccc2[nH]ccc2c1', y=[1801]),
 Data(x=[10, 9], edge_index=[2, 20], edge_attr=[20, 3], smiles='Cc1ccc(CO)cc1Cl', y=[1801]),
 Data(x=[8, 9], edge_index=[2, 14], edge_attr=[14, 3], smiles='CCC(O)CC(C)C', y=[1801]),
 Data(x=[11, 9], edge_index=[2, 22], edge_attr=[22, 3], smiles='COc1ccc(N=C=S)cc1', y=[1801]),
 Data(x=[12, 9], edge_index=[2, 26], edge_attr=[26, 3], smiles='Cc1ccc2ccc(Cl)cc2n1', y=[1801]),
 Data(x=[12, 9], edge_index=[2, 24], edge_attr=[24, 3], smiles='S=C=NCCCc1ccccc1', y=[1801]),
 Data(x=[13, 9], edge_index=[2, 24], edge_attr=[24, 3], smiles='CCCCCCSCCCCCC', y=[1801]),
 Data(x=[13, 9], edge_index=[2, 26], edge_attr=[26, 3], smiles='O=Cc1cccc(C(F)(F)F)c1F', y=[1801]),
 Data(x=[12, 9], edge_index=[2, 24], edge_attr=[24, 3], smiles='CCCC(C#N)c1ccccc1', y=[1801]),
 Data(x=[18, 9], edge_index=[2, 36], edge_attr=[36, 3

In [18]:
from torch import nn
import torch_geometric.nn as tgnn
from graphormer.model import Graphormer


model = Graphormer(
    num_layers=3,
    input_node_dim=input_data.num_node_features,
    node_dim=128,
    input_edge_dim=input_data.num_edge_features,
    edge_dim=128,
    output_dim=1801,
    n_heads=4,
    max_in_degree=5,
    max_out_degree=5,
    max_path_distance=5,
)

In [19]:
model

Graphormer(
  (node_in_lin): Linear(in_features=9, out_features=128, bias=True)
  (edge_in_lin): Linear(in_features=3, out_features=128, bias=True)
  (centrality_encoding): CentralityEncoding()
  (spatial_encoding): SpatialEncoding()
  (layers): ModuleList(
    (0-2): 3 x GraphormerEncoderLayer(
      (attention): GraphormerMultiHeadAttention(
        (heads): ModuleList(
          (0-3): 4 x GraphormerAttentionHead(
            (edge_encoding): EdgeEncoding()
            (q): Linear(in_features=128, out_features=128, bias=True)
            (k): Linear(in_features=128, out_features=128, bias=True)
            (v): Linear(in_features=128, out_features=128, bias=True)
          )
        )
        (linear): Linear(in_features=512, out_features=128, bias=True)
      )
      (ln_1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (ln_2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (ff): Linear(in_features=128, out_features=128, bias=True)
    )
  )
  (node_out_l

In [23]:
from sklearn.model_selection import train_test_split

test_ids, train_ids = train_test_split([i for i in range(len(dataset))], test_size=0.8, random_state=42)
train_loader = DataLoader(Subset(dataset, train_ids), batch_size=64)
test_loader = DataLoader(Subset(dataset, test_ids), batch_size=64)

In [24]:
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)
# loss_functin = nn.L1Loss(reduction="sum")
loss_functin = nn.MSELoss(reduction="sum")

In [29]:
from tqdm import tqdm
from torch_geometric.nn.pool import global_mean_pool

DEVICE = "cuda"
# DEVICE = "cpu"

model.to(DEVICE)
for epoch in range(3):
    model.train()
    batch_loss = 0.0
    for batch in tqdm(train_loader):
        batch.to(DEVICE)
        print(batch)
        y = batch.y
        print(y.shape)
        optimizer.zero_grad()
        output = global_mean_pool(model(batch), batch.batch)
        print(output.shape)
        loss = loss_functin(output, y)
        batch_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
    print("TRAIN_LOSS", batch_loss / len(train_ids))

    model.eval()
    batch_loss = 0.0
    for batch in tqdm(test_loader):
        batch.to(DEVICE)
        y = batch.y
        with torch.no_grad():
            output = global_mean_pool(model(batch), batch.batch)
            loss = loss_functin(output, y)
            
        batch_loss += loss.item()

    print("EVAL LOSS", batch_loss / len(test_ids))

    

  0%|          | 0/96 [00:00<?, ?it/s]

DataBatch(x=[760, 9], edge_index=[2, 1520], edge_attr=[1520, 3], smiles=[64], y=[115264], batch=[760], ptr=[65])
torch.Size([115264])


  0%|          | 0/96 [00:04<?, ?it/s]


KeyboardInterrupt: 

In [27]:
for batch in tqdm(train_loader):
        print(batch)
        break

  0%|          | 0/96 [00:00<?, ?it/s]

DataBatch(x=[760, 9], edge_index=[2, 1520], edge_attr=[1520, 3], smiles=[64], y=[115264], batch=[760], ptr=[65])





In [None]:
model.to(DEVICE)
for epoch in range(3):
    model.train()
    print('1')
    batch_loss = 0.0
    for batch in tqdm(train_loader):
        batch.to(DEVICE)
        y = batch.y
        print('2')

        optimizer.zero_grad()
        print('3')

        output = global_mean_pool(model(batch), batch.batch)
        print('4')


1


  0%|          | 0/96 [00:00<?, ?it/s]

2
3


  1%|          | 1/96 [00:35<56:53, 35.93s/it]

4
2
3


  1%|          | 1/96 [00:40<1:04:53, 40.99s/it]


KeyboardInterrupt: 