In [None]:
import random
import numpy as np
import torch
from rdkit import Chem


# Reproducibility: seed every random number generator used in this notebook.
rd_seed = 42

random.seed(rd_seed)        # Python stdlib RNG
np.random.seed(rd_seed)     # NumPy legacy global RNG
torch.manual_seed(rd_seed)  # PyTorch RNG

# Turn off the cuDNN auto-tuner and request deterministic cuDNN kernels so
# repeated runs pick the same algorithms.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [None]:
import pandas as pd
# Read the training table from ../input/train.csv (path is relative to the
# notebook's working directory) and preview its first rows.
train_df = pd.read_csv("../input/train.csv")
train_df.head()

In [None]:
# Print a concise summary of train_df (column names, dtypes, non-null counts)
# to spot missing values before building the dataset.
train_df.info()

In [None]:
# Sanity check: parse one sample SMILES string (the "SMILES" entry at key 1000)
# into an RDKit molecule object.
mol = Chem.MolFromSmiles(train_df['SMILES'][1000])
mol  # bare last expression so the notebook displays the molecule

---


In [None]:
from pytorch_tabnet.tab_network import AttentiveTransformer, FeatTransformer
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.nn import global_mean_pool
import torch.nn as nn

In [None]:
from tqdm.notebook import tqdm
from torch.utils.data import Dataset
from torch_geometric.loader import DataLoader
from torch_geometric.utils import from_smiles # Converts a SMILES string to a torch_geometric.data.Data instance.

In [None]:
class GraphFeature(nn.Module):
    """Two-layer GCN encoder that produces one embedding vector per graph.

    Node features pass through two ``GCNConv`` layers
    (``node_feat`` -> 8 -> 16 channels, each followed by ELU), are mean-pooled
    per graph, and are finally projected to ``embed_dim`` by a linear layer.

    Args:
        node_feat: number of input features per node.
        embed_dim: size of the per-graph embedding that is returned.
    """

    def __init__(self, node_feat, embed_dim):
        super().__init__()

        hidden_channels, pooled_channels = 8, 16
        self.conv_l1 = GCNConv(node_feat, hidden_channels)
        self.conv_l2 = GCNConv(hidden_channels, pooled_channels)
        self.embedding = nn.Linear(pooled_channels, embed_dim)

    def forward(self, x, edge_idx, batch):
        """Encode a (batched) graph into per-graph embeddings.

        Args:
            x: node feature matrix fed to the first convolution.
            edge_idx: edge index passed to both convolutions.
            batch: vector handed to ``global_mean_pool`` that assigns each
                node to its graph.

        Returns:
            The pooled node representations projected by ``self.embedding``
            (last dimension ``embed_dim``).
        """
        hidden = x
        for conv in (self.conv_l1, self.conv_l2):
            hidden = F.elu(conv(hidden, edge_idx))

        # Read-out layer: average the node representations of each graph.
        pooled = global_mean_pool(hidden, batch)

        return self.embedding(pooled)

In [None]:
class GraphNet(nn.Module):
    """Graph encoder followed by self-attention and an MLP regression head.

    Pipeline: ``GraphFeature`` (per-graph embedding) ->
    ``nn.MultiheadAttention`` (self-attention over the embeddings) ->
    regressor (Linear -> BatchNorm1d -> ELU -> Dropout -> Linear -> ELU ->
    Dropout -> Linear).

    Args:
        graph_dict: mapping with the keys ``"node_feat"`` (input features per
            node) and ``"embed_dim"`` (size of the graph embedding).
        num_heads: number of attention heads; ``nn.MultiheadAttention``
            requires ``embed_dim`` to be divisible by it.
        reg_emb: hidden width of the regression head.
        drop_ratio: dropout probability used in the attention layer and in the
            regression head.
        out_dim: number of values predicted per graph.
    """

    def __init__(self, graph_dict, num_heads, reg_emb, drop_ratio, out_dim):
        # nn.Module must be initialised before any sub-module is assigned,
        # otherwise the first assignment below raises AttributeError.
        super(GraphNet, self).__init__()

        # Informational only: forward() does not move tensors itself. Move the
        # model and its inputs with `.to(...)` from the calling code.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.graph_feature = GraphFeature(graph_dict["node_feat"], graph_dict["embed_dim"])

        self.attn_layer = nn.MultiheadAttention(graph_dict["embed_dim"], num_heads, dropout=drop_ratio)

        self.regressor = nn.Sequential(
            nn.Linear(graph_dict["embed_dim"], reg_emb),
            nn.BatchNorm1d(reg_emb),
            nn.ELU(),
            nn.Dropout(p=drop_ratio),
            nn.Linear(reg_emb, reg_emb),
            nn.ELU(),
            nn.Dropout(p=drop_ratio),
            nn.Linear(reg_emb, out_dim),
        )

    def forward(self, node_attr, edge_idx, batch):
        """Predict ``out_dim`` values for every graph in the batch.

        Args:
            node_attr: node feature matrix passed to the graph encoder.
            edge_idx: edge index passed to the graph encoder.
            batch: node-to-graph assignment vector passed to the graph encoder.

        Returns:
            Output of the regression head, one row per graph embedding.
        """
        # The encoder output already lives wherever the encoder's parameters
        # and inputs are, so no device transfer is needed here.
        graph_feat = self.graph_feature(node_attr, edge_idx, batch)

        # Self-attention: query, key and value are all the graph embeddings
        # (MultiheadAttention.forward requires all three).
        # NOTE(review): a 2-D input is interpreted as ONE unbatched sequence,
        # so the graphs of a mini-batch attend to each other -- confirm that
        # this cross-sample mixing is intended.
        attn_output, _ = self.attn_layer(graph_feat, graph_feat, graph_feat)

        return self.regressor(attn_output)

In [None]:
class MultiDataset(Dataset):
    """Dataset pairing each molecule graph with its tabular features and targets.

    Args:
        dataFrame: table with a ``"SMILES"`` column, an ``"id"`` column and the
            targets ``"MLM"`` and ``"HLM"``. Every remaining column is used as
            a feature and must be castable to ``float32``. Missing values are
            replaced with 0 on an internal copy; the caller's frame is left
            unmodified.

    Attributes:
        train_df: NaN-filled copy of ``dataFrame``.
        graph_list: one graph object per row, built from the SMILES strings.
        target_mlm / target_hlm: float32 tensors of the two targets.
        wo_smiles_df: float32 tensor of the remaining feature columns.
    """

    def __init__(self, dataFrame):
        super(MultiDataset, self).__init__()

        # fillna() without inplace returns a new frame, so the DataFrame owned
        # by the caller is not silently modified.
        self.train_df = dataFrame.fillna(0)

        # Build the graphs from the frame that was passed in (not from a
        # notebook-level global) so graphs, features and targets stay aligned.
        self.graph_list = self.smiles2mol(self.train_df["SMILES"])

        self.target_mlm = torch.tensor(self.train_df["MLM"].values.astype(np.float32))
        self.target_hlm = torch.tensor(self.train_df["HLM"].values.astype(np.float32))

        feature_df = self.train_df.drop(columns=["SMILES", "id", "MLM", "HLM"])
        self.wo_smiles_df = torch.tensor(feature_df.values.astype(np.float32))

    def smiles2mol(self, smiles_list):
        """Convert an iterable of SMILES strings into a list of graph objects.

        Each string is converted with ``from_smiles``; the ``smiles`` and
        ``edge_attr`` attributes of the result are then set to ``None``.
        """
        print('Convert "SMILES" data to mol')

        graph_list = []
        for smiles in tqdm(smiles_list):
            graph_data = from_smiles(smiles)

            # Drop the fields that are not used downstream.
            # NOTE(review): from_smiles presumably yields integer node
            # features -- confirm they are cast to float before GCNConv.
            graph_data.smiles = None
            graph_data.edge_attr = None

            graph_list.append(graph_data)

        print('Complete!')
        return graph_list

    def __getitem__(self, idx):
        """Return ``(graph, tabular_features, mlm_target, hlm_target)`` for row ``idx``."""
        return self.graph_list[idx], self.wo_smiles_df[idx], self.target_mlm[idx], self.target_hlm[idx]

    def __len__(self):
        """Number of samples (one per converted SMILES string)."""
        return len(self.graph_list)


In [None]:
# Build the dataset from the training table (this converts every SMILES string,
# so it can take a while) and wrap it in a PyG DataLoader with 128 samples per batch.
# NOTE(review): `shuffle` is not passed, so batches presumably follow the
# row order of train_df every epoch -- confirm this is intended for training.
train_dataset = MultiDataset(train_df)
train_dataloader = DataLoader(train_dataset, batch_size=128)