In [None]:
!pip install rdkit dgl

In [None]:
import pandas as pd
from rdkit import Chem

import dgl
import dgl.function as fn

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import time, math
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Read the two CSV files from the working directory.
# train.csv supplies the SMILES/HLM/MLM columns used for training below.
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

In [None]:
# Element symbols used for the one-hot atom-type encoding in get_atom_feature.
# The order fixes the position of each element in the feature vector, and any
# symbol missing from the list is encoded as the LAST entry ('Pb'), because
# one_of_k_encoding maps unknown values to vocab[-1].
ATOM_VOCAB = [
	'C', 'N', 'O', 'S', 'F',
	'H', 'Si', 'P', 'Cl', 'Br',
	'Li', 'Na', 'K', 'Mg', 'Ca',
	'Fe', 'As', 'Al', 'I', 'B',
	'V', 'Tl', 'Sb', 'Sn', 'Ag',
	'Pd', 'Co', 'Se', 'Ti', 'Zn',
	'Ge', 'Cu', 'Au', 'Ni', 'Cd',
	'Mn', 'Cr', 'Pt', 'Hg', 'Pb'
]

def one_of_k_encoding(x, vocab):
  """One-hot encode ``x`` against ``vocab``.

  A value that does not occur in ``vocab`` is treated as if it were the last
  vocabulary entry, so unknown values share the final slot.

  Args:
    x: value to encode.
    vocab: non-empty sequence of candidate values.

  Returns:
    A list of floats, one per vocabulary entry, holding 1.0 where the entry
    equals the (possibly substituted) value and 0.0 elsewhere.
  """
  target = x if x in vocab else vocab[-1]
  return [float(target == entry) for entry in vocab]

def get_atom_feature(atom):
  """Build the feature list for one atom.

  The returned list is the concatenation, in this order, of:
    * one-hot element symbol over ATOM_VOCAB
      (e.g. 'C' -> [1, 0, 0, ...], 'N' -> [0, 1, 0, ...]),
    * one-hot degree (number of bonded neighbours) over 0..5,
    * one-hot total hydrogen count over 0..4,
    * one-hot implicit valence over 0..5,
    * the aromaticity flag as returned by ``atom.GetIsAromatic()``.

  Values outside a one-hot range fall into that range's last slot (see
  one_of_k_encoding).
  """
  symbol_part = one_of_k_encoding(atom.GetSymbol(), ATOM_VOCAB)
  degree_part = one_of_k_encoding(atom.GetDegree(), [0, 1, 2, 3, 4, 5])
  hydrogen_part = one_of_k_encoding(atom.GetTotalNumHs(), [0, 1, 2, 3, 4])
  valence_part = one_of_k_encoding(atom.GetImplicitValence(), [0, 1, 2, 3, 4, 5])
  aromatic_part = [atom.GetIsAromatic()]
  return symbol_part + degree_part + hydrogen_part + valence_part + aromatic_part

def get_bond_feature(bond):
  """Build the six-element feature list for one bond.

  The first four entries flag the bond type (single, double, triple,
  aromatic); the last two are whether the bond is conjugated and whether it
  lies in a ring.
  """
  bond_types = Chem.rdchem.BondType
  kind = bond.GetBondType()
  type_flags = [
      kind == candidate
      for candidate in (
          bond_types.SINGLE,
          bond_types.DOUBLE,
          bond_types.TRIPLE,
          bond_types.AROMATIC,
      )
  ]
  # Conjugation flag first, ring-membership flag second.
  return type_flags + [bond.GetIsConjugated(), bond.IsInRing()]

In [None]:
def get_molecular_graph(smi):
  """Convert a SMILES string into a DGL graph carrying atom and bond features.

  Nodes are atoms (feature ``ndata['h']``) and every chemical bond is stored
  as two directed edges, one per direction, that share the same feature row
  (``edata['e_ij']``).

  Args:
    smi: SMILES string of the molecule.

  Returns:
    A DGL graph with float64 node features ``'h'`` and float64 edge features
    ``'e_ij'`` of shape (num_edges, 6), also when the molecule has no bonds.

  Raises:
    ValueError: if RDKit cannot parse ``smi`` or the molecule has no atoms.
  """
  # Must equal the length of the list returned by get_bond_feature; needed to
  # give bond-free molecules a correctly shaped (0, 6) edge feature tensor.
  num_bond_features = 6

  mol = Chem.MolFromSmiles(smi)
  if mol is None:
    # MolFromSmiles signals a parse failure by returning None.
    raise ValueError(f"Could not parse SMILES: {smi!r}")

  atom_feature_list = [get_atom_feature(atom) for atom in mol.GetAtoms()]
  num_atoms = len(atom_feature_list)
  if num_atoms == 0:
    raise ValueError(f"SMILES contains no atoms: {smi!r}")

  # DGL graphs are directed, so each bond is inserted as the pair
  # i --> j and j --> i, in that order, with identical features.
  src_ids, dst_ids, bond_feature_list = [], [], []
  for bond in mol.GetBonds():
    bond_feature = get_bond_feature(bond)
    begin = bond.GetBeginAtom().GetIdx()  # edge start
    end = bond.GetEndAtom().GetIdx()      # edge end
    src_ids += [begin, end]
    dst_ids += [end, begin]
    bond_feature_list += [bond_feature, bond_feature]

  # Create the graph with all edges at once instead of one add_edges call per
  # direction; num_nodes keeps atoms without bonds in the graph.
  graph = dgl.graph(
    (
      torch.tensor(src_ids, dtype=torch.int64),
      torch.tensor(dst_ids, dtype=torch.int64),
    ),
    num_nodes=num_atoms,
  )
  graph.ndata['h'] = torch.tensor(atom_feature_list, dtype=torch.float64)

  if bond_feature_list:
    edge_features = torch.tensor(bond_feature_list, dtype=torch.float64)
  else:
    # torch.tensor([]) would have shape (0,), which does not match the
    # (E, 6) feature layout of the other graphs in a batch.
    edge_features = torch.zeros((0, num_bond_features), dtype=torch.float64)
  graph.edata['e_ij'] = edge_features
  return graph

In [None]:
class MyDataset(Dataset):
  """Dataset of SMILES strings with an optional regression target.

  Args:
    df: table with a 'SMILES' column and, unless ``test`` is true, 'HLM' and
      'MLM' columns.
    hlm: when true the 'HLM' value is the label, otherwise the 'MLM' value.
    test: when true the data is treated as unlabeled: the label columns are
      not read and items are bare SMILES strings.
  """

  def __init__(self, df, hlm=True, test=False):
    self.smi_list = list(df['SMILES'])
    self.hlm = hlm
    self.test = test
    # Unlabeled data has no target columns, so only read them when needed.
    self.hlm_list = None if test else list(df['HLM'])
    self.mlm_list = None if test else list(df['MLM'])

  def __len__(self):
    return len(self.smi_list)

  def __getitem__(self, idx):
    """Return ``smiles`` in test mode, else the pair ``(smiles, label)``."""
    smi = self.smi_list[idx]
    # The test flag has to be checked first: ``hlm`` always selects one of the
    # two label lists, so checking it first would make this branch unreachable.
    if self.test:
      return smi
    labels = self.hlm_list if self.hlm else self.mlm_list
    return smi, labels[idx]

In [None]:
def my_collate_fn(batch):
  """Collate ``(smiles, label)`` items into one batched graph and a label tensor.

  Args:
    batch: sequence of items whose element 0 is a SMILES string and whose
      element 1 is the numeric label.

  Returns:
    A tuple ``(batched_graph, labels)`` where ``batched_graph`` is the
    ``dgl.batch`` of the per-molecule graphs and ``labels`` is a float64
    tensor in the same order.
  """
  molecule_graphs = [get_molecular_graph(item[0]) for item in batch]
  targets = [item[1] for item in batch]
  return dgl.batch(molecule_graphs), torch.tensor(targets, dtype=torch.float64)

In [None]:
# Hold out 20% of train_df with a fixed seed. Despite its name, x_test is the
# validation split: it feeds the loaders that train() evaluates after every
# epoch, not the final test.csv predictions.
x_train, x_test= train_test_split(train_df, test_size = 0.2, random_state=42)

In [None]:
# One dataset per (split, target) pair: the default selects the HLM label,
# hlm=False selects the MLM label.
train_hlm_set = MyDataset(x_train)
train_mlm_set = MyDataset(x_train,hlm=False)
test_hlm_set = MyDataset(x_test)
test_mlm_set = MyDataset(x_test,hlm=False)

In [None]:
# Loaders turn SMILES into graphs on the fly through my_collate_fn.
# Only the training loaders are shuffled; the held-out loaders keep row order.
train_hlm_loader = DataLoader(dataset = train_hlm_set, batch_size=16, shuffle=True, collate_fn = my_collate_fn)
test_hlm_loader = DataLoader(dataset = test_hlm_set, batch_size=16, shuffle=False, collate_fn = my_collate_fn)
train_mlm_loader = DataLoader(dataset = train_mlm_set, batch_size=16, shuffle=True, collate_fn = my_collate_fn)
test_mlm_loader = DataLoader(dataset = test_mlm_set, batch_size=16, shuffle=False, collate_fn = my_collate_fn)

In [None]:
def evaluate_regression(y_list, pred_list):
  """Compute regression metrics over per-batch targets and predictions.

  Args:
    y_list: list of target tensors, one per batch.
    pred_list: list of prediction tensors, one per batch, in the same order.

  Returns:
    The tuple ``(mse, rmse, r2)`` computed over all batches concatenated
    along dimension 0.
  """
  def _as_numpy(chunks):
    # Join the batches, drop any autograd history and move to host memory.
    return torch.cat(chunks, dim=0).detach().cpu().numpy()

  targets = _as_numpy(y_list)
  predictions = _as_numpy(pred_list)

  mse = mean_squared_error(targets, predictions)
  return mse, math.sqrt(mse), r2_score(targets, predictions)

In [None]:
class MLP(nn.Module):
  """Two-layer perceptron: linear -> activation -> linear.

  Args:
    input_dim: size of the last dimension of the input.
    hidden_dim: width of the hidden layer.
    output_dim: size of the last dimension of the output.
    bias: whether both linear layers carry a bias term.
    act: activation applied between the two linear layers.
  """

  def __init__(
    self,
    input_dim,
    hidden_dim,
    output_dim,
    bias=True,
    act=F.relu,
  ):
    super().__init__()

    # Keep the configuration around for inspection.
    self.input_dim, self.hidden_dim, self.output_dim = input_dim, hidden_dim, output_dim
    self.act = act

    self.linear1 = nn.Linear(input_dim, hidden_dim, bias=bias)
    self.linear2 = nn.Linear(hidden_dim, output_dim, bias=bias)

  def forward(self, h):
    """Map ``h`` from ``input_dim`` to ``output_dim`` features."""
    hidden = self.act(self.linear1(h))
    return self.linear2(hidden)

class GraphConvolution(nn.Module):
  """One message-passing layer with a residual connection.

  Each node sums the ``'h'`` features of its in-neighbours, passes the sum
  through an MLP, adds its own previous feature, then applies layer
  normalisation and dropout. Edge features are not used by this layer.

  Args:
    hidden_dim: size of the node feature ``'h'`` (input and output).
    act: activation used inside the MLP.
    dropout_prob: dropout probability applied to the output while training.
  """

  def __init__(self,hidden_dim=64,act=F.relu,dropout_prob=0.2):
    super().__init__()
    # ``act`` is forwarded so the constructor argument actually takes effect.
    self.mlp = MLP(input_dim=hidden_dim, hidden_dim=4*hidden_dim, output_dim=hidden_dim, act=act)
    self.norm = nn.LayerNorm(hidden_dim)
    self.prob = dropout_prob

  def forward(self, graph):
    """Update ``graph.ndata['h']`` in place and return the same graph."""
    h0 = graph.ndata['h']
    # Sum the neighbours' 'h' into the temporary node field 'u_'.
    graph.update_all(fn.copy_u('h', 'm'), fn.sum('m', 'u_'))
    h = self.mlp(graph.ndata['u_']) + h0
    h = self.norm(h)

    # F.dropout defaults to training=True, which would keep dropout active
    # after model.eval(); tie it to the module's mode instead.
    h = F.dropout(h, p=self.prob, training=self.training)

    graph.ndata['h'] = h
    return graph

class MyModel(nn.Module):
  """Graph regression model: embed -> message passing -> readout -> linear.

  Args:
    num_layers: number of GraphConvolution layers.
    hidden_dim: width of the node/edge embeddings and of every layer.
    dropout_prob: dropout probability used inside each GraphConvolution.
    readout: reduction passed to ``dgl.readout_nodes`` (e.g. 'sum').
    act: activation used inside each GraphConvolution.
    initial_node_dim: length of the raw node feature ``ndata['h']``.
    initial_edge_dim: length of the raw edge feature ``edata['e_ij']``.
  """

  def __init__(self, num_layers=4, hidden_dim=64, dropout_prob=0.2, readout='sum', act=F.relu, initial_node_dim=58, initial_edge_dim=6):
    super().__init__()
    self.num_layers = num_layers
    self.embedding_node = nn.Linear(initial_node_dim, hidden_dim, bias=False)
    self.embedding_edge = nn.Linear(initial_edge_dim, hidden_dim, bias=False)
    self.readout = readout

    # Forward the hyperparameters to every layer; building the layers with
    # their own defaults would break any hidden_dim other than 64 and ignore
    # dropout_prob and act.
    self.mp_layers = torch.nn.ModuleList([
      GraphConvolution(hidden_dim=hidden_dim, act=act, dropout_prob=dropout_prob)
      for _ in range(self.num_layers)
    ])

    self.linear_out = nn.Linear(hidden_dim, 1, bias=False)

  def forward(self, graph):
    """Return one prediction per graph in the batch, shape (num_graphs, 1).

    The raw features stored on ``graph`` are left untouched: all feature
    writes happen inside ``graph.local_scope()``, so the same graph can be
    passed to a model more than once.
    """
    with graph.local_scope():
      graph.ndata['h'] = self.embedding_node(graph.ndata['h'].float())
      graph.edata['e_ij'] = self.embedding_edge(graph.edata['e_ij'].float())

      for mp_layer in self.mp_layers:
        graph = mp_layer(graph=graph)

      # Pool node features per graph while the updated 'h' is still visible.
      out = dgl.readout_nodes(graph, 'h', op=self.readout)
    return self.linear_out(out)

In [None]:
# Device used for the model and for every batch in train().
# The CUDA auto-detection line is kept for reference but disabled, so
# everything currently runs on the CPU.
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = 'cpu'

In [None]:
def _run_epoch(loader, model, loss_fn, epoch, tag, optimizer=None):
  """Run one pass over ``loader`` and return ``(mse, rmse, r2)`` for it.

  Weights are updated only when ``optimizer`` is given. The caller is
  responsible for ``model.train()`` / ``model.eval()`` and for wrapping
  evaluation passes in ``torch.no_grad()``.
  """
  num_batches = len(loader)
  y_list = []
  pred_list = []
  for i, batch in enumerate(loader):
    st = time.time()
    graph, y = batch[0], batch[1]
    graph = graph.to(device)
    y = y.to(device).float()

    if optimizer is not None:
      optimizer.zero_grad()

    # squeeze(-1) drops only the trailing feature axis, so a batch with a
    # single molecule stays 1-D (a 0-d tensor cannot be concatenated later).
    pred = model(graph).squeeze(-1)
    loss = loss_fn(pred, y)

    if optimizer is not None:
      loss.backward()
      optimizer.step()

    y_list.append(y)
    # Detach so the autograd graph of every batch is not kept alive until
    # the end of the epoch just to compute metrics.
    pred_list.append(pred.detach())

    et = time.time()
    print (tag + " Epoch:", epoch+1,
          "\t Batch:", i+1, '/', num_batches,
          "\t Loss:", loss.detach().cpu().numpy(),
          "\t Time spent:", round(et-st, 2), "(s)")
  return evaluate_regression(y_list=y_list, pred_list=pred_list)


def train(train_loader, test_loader, model, num_epoches=100):
  """Train ``model`` with MSE loss and report metrics after every epoch.

  Uses AdamW (lr 1e-4, no weight decay) with a StepLR schedule that
  multiplies the learning rate by 0.1 every 40 epochs.

  Args:
    train_loader: loader yielding ``(batched_graph, labels)`` for training.
    test_loader: loader yielding ``(batched_graph, labels)`` that is
      evaluated after each epoch (used as the validation set).
    model: module mapping a batched graph to one prediction per graph.
    num_epoches: number of passes over ``train_loader``.

  Returns:
    The trained model, moved to the global ``device``.
  """
  model = model.to(device)
  optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=0.0001,
    weight_decay=0.0,
  )
  scheduler = torch.optim.lr_scheduler.StepLR(
    optimizer=optimizer,
    step_size=40,
    gamma=0.1,
  )
  mse_loss = nn.MSELoss()
  for epoch in range(num_epoches):
    # Train
    model.train()
    train_metrics = _run_epoch(
      train_loader, model, mse_loss, epoch, "Train!!!", optimizer=optimizer
    )
    scheduler.step()

    # Validation
    model.eval()
    with torch.no_grad():
      valid_metrics = _run_epoch(
        test_loader, model, mse_loss, epoch, "Valid!!!"
      )

    print ("End of ", epoch+1, "-th epoch",
          "mse:", round(train_metrics[0], 3), "\t", round(valid_metrics[0], 3),
          "rmse:", round(train_metrics[1], 3), "\t", round(valid_metrics[1], 3),
          "r2:", round(train_metrics[2], 3), "\t", round(valid_metrics[2], 3))
  return model

In [None]:
# Two independent models with the default MyModel hyperparameters,
# one per regression target (MLM and HLM).
model_MLM=MyModel()
model_HLM=MyModel()

In [None]:
# Fit each model on its own target; the x_test loaders serve as the
# per-epoch validation data inside train().
print("Training Start: HLM")
model_HLM = train(train_hlm_loader, test_hlm_loader, model_HLM)
print("Training Start: MLM")
model_MLM = train(train_mlm_loader, test_mlm_loader, model_MLM)

In [None]:
# Molecules to predict for the submission. This reads the same 'test.csv'
# file that was already loaded into test_df near the top of the notebook.
test = pd.read_csv('test.csv')

In [None]:
class MyDataset_1(Dataset):
  """Label-free dataset that exposes only the 'SMILES' column of ``df``."""

  def __init__(self, df):
    smiles_column = df['SMILES']
    self.smi_list = [*smiles_column]

  def __len__(self):
    """Number of molecules."""
    molecule_count = len(self.smi_list)
    return molecule_count

  def __getitem__(self, idx):
    """SMILES string stored at position ``idx``."""
    return self.smi_list[idx]

In [None]:
def test_collate_fn(batch):
  """Collate bare SMILES strings into a single batched graph.

  Args:
    batch: iterable of SMILES strings (no labels).

  Returns:
    The ``dgl.batch`` of the per-molecule graphs, in input order.
  """
  molecule_graphs = [get_molecular_graph(smi) for smi in batch]
  return dgl.batch(molecule_graphs)

In [None]:
# Wrap the SMILES column of the test table; items are plain strings.
data_set = MyDataset_1(test)

In [None]:
# test_collate_fn is called directly on the dataset (no DataLoader), so ALL
# test molecules end up in one batched graph. Each model gets its own freshly
# built graph, so neither prediction depends on features that the other
# model's forward pass may have written to the graph.
hlm_data = test_collate_fn(data_set)
mlm_data = test_collate_fn(data_set)

In [None]:
# Inference only: switch both models to eval mode and disable autograd so no
# computation graph is recorded for the whole test set. The graphs are moved
# to the device the models were placed on by train().
model_MLM.eval()
model_HLM.eval()
with torch.no_grad():
  mlm = model_MLM(mlm_data.to(device))
  hlm = model_HLM(hlm_data.to(device))

In [None]:
# Turn each single-value prediction row into a Python float rounded to two
# decimals; one comprehension per target replaces the duplicated loops.
mlm_list = [round(float(prediction), 2) for prediction in mlm]
hlm_list = [round(float(prediction), 2) for prediction in hlm]

In [None]:
# Fill the sample submission with the predictions and write it out.
# The list assignments require sample_submission.csv to have as many rows as
# there are predictions, i.e. as test.csv, in the same order.
submit = pd.read_csv('sample_submission.csv')
submit['MLM'] = mlm_list
submit['HLM'] = hlm_list
# utf-8-sig writes a BOM at the start of the file.
submit.to_csv('sub.csv',encoding='utf-8-sig',index=False)